[2026-02-12 03:39:06,776] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:9815] baseline 0.000GB ()
[2026-02-12 03:39:06,782] [INFO] [axolotl.cli.config.load_cfg:259] [PID:9815] config:
{
  "activation_offloading": false,
  "adapter": "lora",
  "auto_resume_from_checkpoints": false,
  "axolotl_config_path": "new_data_normal.yaml",
  "base_model": "alpindale/Mistral-7B-v0.2-hf",
  "base_model_config": "alpindale/Mistral-7B-v0.2-hf",
  "batch_size": 120,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": true,
    "n_gpu": 1,
    "n_node": 1
  },
  "chat_template": "chatml",
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 26,
  "dataset_prepared_path": "last_finetune_prepared",
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "newsessions_revisedcutdown.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    },
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "hidden_compressedgamescontext_reducedctx.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    },
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "manual_gameplay_truncated.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    },
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "newdataplussilent-filtered.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    },
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "newprompt_lessreduced_1.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    },
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "newprompt_lessreduced_2.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    },
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "newprompt_lessreduced_orig.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    },
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "generics/bluemoon-2_5mil.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    },
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "generics/capybara_2_5mil.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    },
    {
      "chat_template": "tokenizer_default",
      "field_messages": "conversations",
      "message_property_mappings": {
        "content": "value",
        "role": "from"
      },
      "path": "generics/qwq_3million.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": false,
  "eval_table_size": 0,
  "evals_per_epoch": 1,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "gradient_accumulation_steps": 20,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "group_by_length": false,
  "hub_model_id": "Heralax/dpn-newdata-normal-actual",
  "hub_strategy": "all_checkpoints",
  "include_tkps": true,
  "is_falcon_derived_model": false,
  "is_llama_derived_model": false,
  "is_mistral_derived_model": true,
  "learning_rate": 0.0001,
  "liger_fused_linear_cross_entropy": true,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 512,
  "lora_dropout": 0.4,
  "lora_r": 128,
  "lora_target_linear": true,
  "lora_target_modules": [
    "gate_proj",
    "down_proj",
    "up_proj",
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj"
  ],
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "constant",
  "max_grad_norm": 1.0,
  "mean_resizing_embeddings": false,
  "micro_batch_size": 6,
  "model_config_type": "mistral",
  "noisy_embedding_alpha": 0.0,
  "num_epochs": 2.0,
  "optimizer": "paged_adamw_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./finetune-model-output",
  "pad_to_sequence_len": true,
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": false,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_strategy": "epoch",
  "seed": 1337,
  "sequence_len": 11000,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "special_tokens": {
    "eos_token": "</s>",
    "pad_token": "<unk>"
  },
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "alpindale/Mistral-7B-v0.2-hf",
  "tokenizer_save_jinja_files": true,
  "tokenizer_type": "AutoTokenizer",
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "type_of_model": "AutoModelForCausalLM",
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_entity": "",
  "wandb_log_model": "",
  "wandb_project": "diplonations",
  "wandb_run_id": "",
  "wandb_watch": "",
  "warmup_ratio": 0.1,
  "weight_decay": 0.01,
  "world_size": 1,
  "xformers_attention": false
}
[2026-02-12 03:39:08,679] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:9815] EOS: 2 / </s>
[2026-02-12 03:39:08,683] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:9815] BOS: 1 / <s>
[2026-02-12 03:39:08,686] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:9815] PAD: 0 / <unk>
[2026-02-12 03:39:08,687] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:9815] UNK: 0 / <unk>
[2026-02-12 03:39:08,738] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:9815] Loading prepared dataset from disk at last_finetune_prepared/d23474b21049f2b028a7ed7e051437f0...
Loading dataset from disk:   0%|                                                                                                                                                                                           | 0/26 [00:00<?, ?it/s]Loading dataset from disk:  19%|██████████████████████████████████▍                                                                                                                                                | 5/26 [00:00<00:00, 46.49it/s]Loading dataset from disk: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 171.45it/s]
[2026-02-12 03:39:25,960] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:9815] total_num_tokens: 248_541_569
[2026-02-12 03:39:47,545] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:9815] `total_supervised_tokens: 52_816_669`
[2026-02-12 03:39:47,548] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:9815] total_num_steps: 656
[2026-02-12 03:39:47,549] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:9815] Maximum number of steps set at 656
[2026-02-12 03:39:47,556] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:9815] loading tokenizer... alpindale/Mistral-7B-v0.2-hf
[2026-02-12 03:39:49,542] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:9815] EOS: 2 / </s>
[2026-02-12 03:39:49,545] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:9815] BOS: 1 / <s>
[2026-02-12 03:39:49,550] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:9815] PAD: 0 / <unk>
[2026-02-12 03:39:49,554] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:9815] UNK: 0 / <unk>
[2026-02-12 03:39:49,556] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:9815] Loading model
[2026-02-12 03:39:49,855] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:9815] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-02-12 03:39:49,860] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:9815] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-02-12 03:39:49,992] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:9815] Applying LIGER to mistral with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}
Loading weights:   0%|                                                                                                                                                                                                    | 0/291 [00:00<?, ?it/s]Loading weights:   0%|▌                                                                                                                                                     | 1/291 [00:00<00:00, 8594.89it/s, Materializing param=lm_head.weight]Loading weights:   0%|▌                                                                                                                                                      | 1/291 [00:00<00:00, 486.35it/s, Materializing param=lm_head.weight]Loading weights:   1%|▉                                                                                                                                           | 2/291 [00:00<00:00, 329.06it/s, Materializing param=model.embed_tokens.weight]Loading weights:   1%|▉                                                                                                                                           | 2/291 [00:00<00:01, 214.83it/s, Materializing param=model.embed_tokens.weight]Loading weights:   1%|█▎                                                                                                                              | 3/291 [00:00<00:01, 202.42it/s, Materializing param=model.layers.0.input_layernorm.weight]Loading weights:   1%|█▎                                                                                                                              | 3/291 [00:00<00:01, 160.55it/s, Materializing param=model.layers.0.input_layernorm.weight]Loading weights:   1%|█▊                                                                                                                                | 4/291 [00:00<00:01, 176.83it/s, Materializing param=model.layers.0.mlp.down_proj.weight]Loading weights:   1%|█▊                                                                                                                                | 4/291 [00:00<00:01, 153.23it/s, Materializing param=model.layers.0.mlp.down_proj.weight]Loading weights:   2%|██▏                                                                                                                               | 5/291 [00:00<00:01, 164.21it/s, Materializing param=model.layers.0.mlp.gate_proj.weight]Loading weights:   2%|██▏                                                                                                                               | 5/291 [00:00<00:01, 150.78it/s, Materializing param=model.layers.0.mlp.gate_proj.weight]Loading weights:   2%|██▋                                                                                                                                 | 6/291 [00:00<00:01, 162.18it/s, Materializing param=model.layers.0.mlp.up_proj.weight]Loading weights:   2%|██▋                                                                                                                                 | 6/291 [00:00<00:01, 147.67it/s, Materializing param=model.layers.0.mlp.up_proj.weight]Loading weights:   2%|██▊                                                                                                                    | 7/291 [00:00<00:01, 158.96it/s, Materializing param=model.layers.0.post_attention_layernorm.weight]Loading weights:   2%|██▊                                                                                                                    | 7/291 [00:00<00:01, 145.27it/s, Materializing param=model.layers.0.post_attention_layernorm.weight]Loading weights:   3%|███▍                                                                                                                           | 8/291 [00:00<00:01, 156.05it/s, Materializing param=model.layers.0.self_attn.k_proj.weight]Loading weights:   3%|███▍                                                                                                                           | 8/291 [00:00<00:01, 146.74it/s, Materializing param=model.layers.0.self_attn.k_proj.weight]Loading weights:   3%|███▉                                                                                                                           | 9/291 [00:00<00:01, 153.82it/s, Materializing param=model.layers.0.self_attn.o_proj.weight]Loading weights:   3%|███▉                                                                                                                           | 9/291 [00:00<00:01, 144.82it/s, Materializing param=model.layers.0.self_attn.o_proj.weight]Loading weights:   3%|████▎                                                                                                                         | 10/291 [00:00<00:01, 150.11it/s, Materializing param=model.layers.0.self_attn.q_proj.weight]Loading weights:   3%|████▎                                                                                                                         | 10/291 [00:00<00:01, 143.96it/s, Materializing param=model.layers.0.self_attn.q_proj.weight]Loading weights:   4%|████▊                                                                                                                         | 11/291 [00:00<00:01, 142.14it/s, Materializing param=model.layers.0.self_attn.v_proj.weight]Loading weights:   4%|████▊                                                                                                                         | 11/291 [00:00<00:02, 135.70it/s, Materializing param=model.layers.0.self_attn.v_proj.weight]Loading weights:   4%|█████▏                                                                                                                         | 12/291 [00:00<00:01, 140.50it/s, Materializing param=model.layers.1.input_layernorm.weight]Loading weights:   4%|█████▏                                                                                                                         | 12/291 [00:00<00:02, 136.68it/s, Materializing param=model.layers.1.input_layernorm.weight]Loading weights:   4%|█████▊                                                                                                                           | 13/291 [00:00<00:01, 140.50it/s, Materializing param=model.layers.1.mlp.down_proj.weight]Loading weights:   4%|█████▊                                                                                                                           | 13/291 [00:00<00:02, 135.88it/s, Materializing param=model.layers.1.mlp.down_proj.weight]Loading weights:   5%|██████▏                                                                                                                          | 14/291 [00:00<00:01, 140.17it/s, Materializing param=model.layers.1.mlp.gate_proj.weight]Loading weights:   5%|██████▏                                                                                                                          | 14/291 [00:00<00:02, 134.51it/s, Materializing param=model.layers.1.mlp.gate_proj.weight]Loading weights:   5%|██████▋                                                                                                                          | 15/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.mlp.gate_proj.weight]Loading weights:   5%|██████▊                                                                                                                            | 15/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.mlp.up_proj.weight]Loading weights:   5%|██████▊                                                                                                                            | 15/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.mlp.up_proj.weight]Loading weights:   5%|██████▍                                                                                                               | 16/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.post_attention_layernorm.weight]Loading weights:   5%|██████▍                                                                                                               | 16/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.post_attention_layernorm.weight]Loading weights:   6%|███████▎                                                                                                                      | 17/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.self_attn.k_proj.weight]Loading weights:   6%|███████▎                                                                                                                      | 17/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.self_attn.k_proj.weight]Loading weights:   6%|███████▊                                                                                                                      | 18/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.self_attn.o_proj.weight]Loading weights:   6%|███████▊                                                                                                                      | 18/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.self_attn.o_proj.weight]Loading weights:   7%|████████▏                                                                                                                     | 19/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.self_attn.q_proj.weight]Loading weights:   7%|████████▏                                                                                                                     | 19/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.self_attn.q_proj.weight]Loading weights:   7%|████████▋                                                                                                                     | 20/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.self_attn.v_proj.weight]Loading weights:   7%|████████▋                                                                                                                     | 20/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.1.self_attn.v_proj.weight]Loading weights:   7%|█████████▏                                                                                                                     | 21/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.input_layernorm.weight]Loading weights:   7%|█████████▏                                                                                                                     | 21/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.input_layernorm.weight]Loading weights:   8%|█████████▊                                                                                                                       | 22/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.mlp.down_proj.weight]Loading weights:   8%|█████████▊                                                                                                                       | 22/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.mlp.down_proj.weight]Loading weights:   8%|██████████▏                                                                                                                      | 23/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.mlp.gate_proj.weight]Loading weights:   8%|██████████▏                                                                                                                      | 23/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.mlp.gate_proj.weight]Loading weights:   8%|██████████▊                                                                                                                        | 24/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.mlp.up_proj.weight]Loading weights:   8%|██████████▊                                                                                                                        | 24/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.mlp.up_proj.weight]Loading weights:   9%|██████████▏                                                                                                           | 25/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.post_attention_layernorm.weight]Loading weights:   9%|██████████▏                                                                                                           | 25/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.post_attention_layernorm.weight]Loading weights:   9%|███████████▎                                                                                                                  | 26/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.self_attn.k_proj.weight]Loading weights:   9%|███████████▎                                                                                                                  | 26/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.self_attn.k_proj.weight]Loading weights:   9%|███████████▋                                                                                                                  | 27/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.self_attn.o_proj.weight]Loading weights:   9%|███████████▋                                                                                                                  | 27/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.self_attn.o_proj.weight]Loading weights:  10%|████████████                                                                                                                  | 28/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.self_attn.q_proj.weight]Loading weights:  10%|████████████                                                                                                                  | 28/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.self_attn.q_proj.weight]Loading weights:  10%|████████████▌                                                                                                                 | 29/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.self_attn.v_proj.weight]Loading weights:  10%|████████████▌                                                                                                                 | 29/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.2.self_attn.v_proj.weight]Loading weights:  10%|█████████████                                                                                                                  | 30/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.3.input_layernorm.weight]Loading weights:  10%|█████████████                                                                                                                  | 30/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.3.input_layernorm.weight]Loading weights:  11%|█████████████▋                                                                                                                   | 31/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.3.mlp.down_proj.weight]Loading weights:  11%|█████████████▋                                                                                                                   | 31/291 [00:00<00:01, 138.12it/s, Materializing param=model.layers.3.mlp.down_proj.weight]Loading weights:  11%|██████████████▏                                                                                                                  | 32/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.mlp.down_proj.weight]Loading weights:  11%|██████████████▏                                                                                                                  | 32/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.mlp.gate_proj.weight]Loading weights:  11%|██████████████▏                                                                                                                  | 32/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.mlp.gate_proj.weight]Loading weights:  11%|██████████████▊                                                                                                                    | 33/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.mlp.up_proj.weight]Loading weights:  11%|██████████████▊                                                                                                                    | 33/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.mlp.up_proj.weight]Loading weights:  12%|█████████████▊                                                                                                        | 34/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.post_attention_layernorm.weight]Loading weights:  12%|█████████████▊                                                                                                        | 34/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.post_attention_layernorm.weight]Loading weights:  12%|███████████████▏                                                                                                              | 35/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.self_attn.k_proj.weight]Loading weights:  12%|███████████████▏                                                                                                              | 35/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.self_attn.k_proj.weight]Loading weights:  12%|███████████████▌                                                                                                              | 36/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.self_attn.o_proj.weight]Loading weights:  12%|███████████████▌                                                                                                              | 36/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.self_attn.o_proj.weight]Loading weights:  13%|████████████████                                                                                                              | 37/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.self_attn.q_proj.weight]Loading weights:  13%|████████████████                                                                                                              | 37/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.self_attn.q_proj.weight]Loading weights:  13%|████████████████▍                                                                                                             | 38/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.self_attn.v_proj.weight]Loading weights:  13%|████████████████▍                                                                                                             | 38/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.3.self_attn.v_proj.weight]Loading weights:  13%|█████████████████                                                                                                              | 39/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.input_layernorm.weight]Loading weights:  13%|█████████████████                                                                                                              | 39/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.input_layernorm.weight]Loading weights:  14%|█████████████████▋                                                                                                               | 40/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.mlp.down_proj.weight]Loading weights:  14%|█████████████████▋                                                                                                               | 40/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.mlp.down_proj.weight]Loading weights:  14%|██████████████████▏                                                                                                              | 41/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.mlp.gate_proj.weight]Loading weights:  14%|██████████████████▏                                                                                                              | 41/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.mlp.gate_proj.weight]Loading weights:  14%|██████████████████▉                                                                                                                | 42/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.mlp.up_proj.weight]Loading weights:  14%|██████████████████▉                                                                                                                | 42/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.mlp.up_proj.weight]Loading weights:  15%|█████████████████▍                                                                                                    | 43/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.post_attention_layernorm.weight]Loading weights:  15%|█████████████████▍                                                                                                    | 43/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.post_attention_layernorm.weight]Loading weights:  15%|███████████████████                                                                                                           | 44/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.self_attn.k_proj.weight]Loading weights:  15%|███████████████████                                                                                                           | 44/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.self_attn.k_proj.weight]Loading weights:  15%|███████████████████▍                                                                                                          | 45/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.self_attn.o_proj.weight]Loading weights:  15%|███████████████████▍                                                                                                          | 45/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.self_attn.o_proj.weight]Loading weights:  16%|███████████████████▉                                                                                                          | 46/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.self_attn.q_proj.weight]Loading weights:  16%|███████████████████▉                                                                                                          | 46/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.self_attn.q_proj.weight]Loading weights:  16%|████████████████████▎                                                                                                         | 47/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.self_attn.v_proj.weight]Loading weights:  16%|████████████████████▎                                                                                                         | 47/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.4.self_attn.v_proj.weight]Loading weights:  16%|████████████████████▉                                                                                                          | 48/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.input_layernorm.weight]Loading weights:  16%|████████████████████▉                                                                                                          | 48/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.input_layernorm.weight]Loading weights:  17%|█████████████████████▋                                                                                                           | 49/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.mlp.down_proj.weight]Loading weights:  17%|█████████████████████▋                                                                                                           | 49/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.mlp.down_proj.weight]Loading weights:  17%|██████████████████████▏                                                                                                          | 50/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.mlp.gate_proj.weight]Loading weights:  17%|██████████████████████▏                                                                                                          | 50/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.mlp.gate_proj.weight]Loading weights:  18%|██████████████████████▉                                                                                                            | 51/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.mlp.up_proj.weight]Loading weights:  18%|██████████████████████▉                                                                                                            | 51/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.mlp.up_proj.weight]Loading weights:  18%|█████████████████████                                                                                                 | 52/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.post_attention_layernorm.weight]Loading weights:  18%|█████████████████████                                                                                                 | 52/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.post_attention_layernorm.weight]Loading weights:  18%|██████████████████████▉                                                                                                       | 53/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.self_attn.k_proj.weight]Loading weights:  18%|██████████████████████▉                                                                                                       | 53/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.self_attn.k_proj.weight]Loading weights:  19%|███████████████████████▍                                                                                                      | 54/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]Loading weights:  19%|███████████████████████▍                                                                                                      | 54/291 [00:00<00:01, 155.06it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]Loading weights:  19%|███████████████████████▊                                                                                                      | 55/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]Loading weights:  19%|███████████████████████▊                                                                                                      | 55/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.5.self_attn.q_proj.weight]Loading weights:  19%|███████████████████████▊                                                                                                      | 55/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.5.self_attn.q_proj.weight]Loading weights:  19%|████████████████████████▏                                                                                                     | 56/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.5.self_attn.v_proj.weight]Loading weights:  19%|████████████████████████▏                                                                                                     | 56/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.5.self_attn.v_proj.weight]Loading weights:  20%|████████████████████████▉                                                                                                      | 57/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.input_layernorm.weight]Loading weights:  20%|████████████████████████▉                                                                                                      | 57/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.input_layernorm.weight]Loading weights:  20%|█████████████████████████▋                                                                                                       | 58/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.mlp.down_proj.weight]Loading weights:  20%|█████████████████████████▋                                                                                                       | 58/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.mlp.down_proj.weight]Loading weights:  20%|██████████████████████████▏                                                                                                      | 59/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.mlp.gate_proj.weight]Loading weights:  20%|██████████████████████████▏                                                                                                      | 59/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.mlp.gate_proj.weight]Loading weights:  21%|███████████████████████████                                                                                                        | 60/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.mlp.up_proj.weight]Loading weights:  21%|███████████████████████████                                                                                                        | 60/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.mlp.up_proj.weight]Loading weights:  21%|████████████████████████▋                                                                                             | 61/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.post_attention_layernorm.weight]Loading weights:  21%|████████████████████████▋                                                                                             | 61/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.post_attention_layernorm.weight]Loading weights:  21%|██████████████████████████▊                                                                                                   | 62/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.self_attn.k_proj.weight]Loading weights:  21%|██████████████████████████▊                                                                                                   | 62/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.self_attn.k_proj.weight]Loading weights:  22%|███████████████████████████▎                                                                                                  | 63/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.self_attn.o_proj.weight]Loading weights:  22%|███████████████████████████▎                                                                                                  | 63/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.self_attn.o_proj.weight]Loading weights:  22%|███████████████████████████▋                                                                                                  | 64/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.self_attn.q_proj.weight]Loading weights:  22%|███████████████████████████▋                                                                                                  | 64/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.self_attn.q_proj.weight]Loading weights:  22%|████████████████████████████▏                                                                                                 | 65/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.self_attn.v_proj.weight]Loading weights:  22%|████████████████████████████▏                                                                                                 | 65/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.6.self_attn.v_proj.weight]Loading weights:  23%|████████████████████████████▊                                                                                                  | 66/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.input_layernorm.weight]Loading weights:  23%|████████████████████████████▊                                                                                                  | 66/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.input_layernorm.weight]Loading weights:  23%|█████████████████████████████▋                                                                                                   | 67/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.mlp.down_proj.weight]Loading weights:  23%|█████████████████████████████▋                                                                                                   | 67/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.mlp.down_proj.weight]Loading weights:  23%|██████████████████████████████▏                                                                                                  | 68/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.mlp.gate_proj.weight]Loading weights:  23%|██████████████████████████████▏                                                                                                  | 68/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.mlp.gate_proj.weight]Loading weights:  24%|███████████████████████████████                                                                                                    | 69/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.mlp.up_proj.weight]Loading weights:  24%|███████████████████████████████                                                                                                    | 69/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.mlp.up_proj.weight]Loading weights:  24%|████████████████████████████▍                                                                                         | 70/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.post_attention_layernorm.weight]Loading weights:  24%|████████████████████████████▍                                                                                         | 70/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.post_attention_layernorm.weight]Loading weights:  24%|██████████████████████████████▋                                                                                               | 71/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.self_attn.k_proj.weight]Loading weights:  24%|██████████████████████████████▋                                                                                               | 71/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.self_attn.k_proj.weight]Loading weights:  25%|███████████████████████████████▏                                                                                              | 72/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.self_attn.o_proj.weight]Loading weights:  25%|███████████████████████████████▏                                                                                              | 72/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.self_attn.o_proj.weight]Loading weights:  25%|███████████████████████████████▌                                                                                              | 73/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.self_attn.q_proj.weight]Loading weights:  25%|███████████████████████████████▌                                                                                              | 73/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.self_attn.q_proj.weight]Loading weights:  25%|████████████████████████████████                                                                                              | 74/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.self_attn.v_proj.weight]Loading weights:  25%|████████████████████████████████                                                                                              | 74/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.7.self_attn.v_proj.weight]Loading weights:  26%|████████████████████████████████▋                                                                                              | 75/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.input_layernorm.weight]Loading weights:  26%|████████████████████████████████▋                                                                                              | 75/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.input_layernorm.weight]Loading weights:  26%|█████████████████████████████████▋                                                                                               | 76/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.mlp.down_proj.weight]Loading weights:  26%|█████████████████████████████████▋                                                                                               | 76/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.mlp.down_proj.weight]Loading weights:  26%|██████████████████████████████████▏                                                                                              | 77/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.mlp.gate_proj.weight]Loading weights:  26%|██████████████████████████████████▏                                                                                              | 77/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.mlp.gate_proj.weight]Loading weights:  27%|███████████████████████████████████                                                                                                | 78/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.mlp.up_proj.weight]Loading weights:  27%|███████████████████████████████████                                                                                                | 78/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.mlp.up_proj.weight]Loading weights:  27%|████████████████████████████████                                                                                      | 79/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.post_attention_layernorm.weight]Loading weights:  27%|████████████████████████████████                                                                                      | 79/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.post_attention_layernorm.weight]Loading weights:  27%|██████████████████████████████████▋                                                                                           | 80/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.self_attn.k_proj.weight]Loading weights:  27%|██████████████████████████████████▋                                                                                           | 80/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.self_attn.k_proj.weight]Loading weights:  28%|███████████████████████████████████                                                                                           | 81/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.self_attn.o_proj.weight]Loading weights:  28%|███████████████████████████████████                                                                                           | 81/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.self_attn.o_proj.weight]Loading weights:  28%|███████████████████████████████████▌                                                                                          | 82/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.self_attn.q_proj.weight]Loading weights:  28%|███████████████████████████████████▌                                                                                          | 82/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.self_attn.q_proj.weight]Loading weights:  29%|███████████████████████████████████▉                                                                                          | 83/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.self_attn.v_proj.weight]Loading weights:  29%|███████████████████████████████████▉                                                                                          | 83/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.8.self_attn.v_proj.weight]Loading weights:  29%|████████████████████████████████████▋                                                                                          | 84/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.9.input_layernorm.weight]Loading weights:  29%|████████████████████████████████████▋                                                                                          | 84/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.9.input_layernorm.weight]Loading weights:  29%|█████████████████████████████████████▋                                                                                           | 85/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.9.mlp.down_proj.weight]Loading weights:  29%|█████████████████████████████████████▋                                                                                           | 85/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.9.mlp.down_proj.weight]Loading weights:  30%|██████████████████████████████████████                                                                                           | 86/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.9.mlp.gate_proj.weight]Loading weights:  30%|██████████████████████████████████████                                                                                           | 86/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.9.mlp.gate_proj.weight]Loading weights:  30%|███████████████████████████████████████▏                                                                                           | 87/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.9.mlp.up_proj.weight]Loading weights:  30%|███████████████████████████████████████▏                                                                                           | 87/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.9.mlp.up_proj.weight]Loading weights:  30%|███████████████████████████████████▋                                                                                  | 88/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.9.post_attention_layernorm.weight]Loading weights:  30%|███████████████████████████████████▋                                                                                  | 88/291 [00:00<00:01, 187.88it/s, Materializing param=model.layers.9.post_attention_layernorm.weight]Loading weights:  31%|████████████████████████████████████                                                                                  | 89/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.9.post_attention_layernorm.weight]Loading weights:  31%|██████████████████████████████████████▌                                                                                       | 89/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.9.self_attn.k_proj.weight]Loading weights:  31%|██████████████████████████████████████▌                                                                                       | 89/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.9.self_attn.k_proj.weight]Loading weights:  31%|██████████████████████████████████████▉                                                                                       | 90/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.9.self_attn.o_proj.weight]Loading weights:  31%|██████████████████████████████████████▉                                                                                       | 90/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.9.self_attn.o_proj.weight]Loading weights:  31%|███████████████████████████████████████▍                                                                                      | 91/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.9.self_attn.q_proj.weight]Loading weights:  31%|███████████████████████████████████████▍                                                                                      | 91/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.9.self_attn.q_proj.weight]Loading weights:  32%|███████████████████████████████████████▊                                                                                      | 92/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.9.self_attn.v_proj.weight]Loading weights:  32%|███████████████████████████████████████▊                                                                                      | 92/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.9.self_attn.v_proj.weight]Loading weights:  32%|████████████████████████████████████████▎                                                                                     | 93/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.input_layernorm.weight]Loading weights:  32%|████████████████████████████████████████▎                                                                                     | 93/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.input_layernorm.weight]Loading weights:  32%|█████████████████████████████████████████▎                                                                                      | 94/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.mlp.down_proj.weight]Loading weights:  32%|█████████████████████████████████████████▎                                                                                      | 94/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.mlp.down_proj.weight]Loading weights:  33%|█████████████████████████████████████████▊                                                                                      | 95/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.mlp.gate_proj.weight]Loading weights:  33%|█████████████████████████████████████████▊                                                                                      | 95/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.mlp.gate_proj.weight]Loading weights:  33%|██████████████████████████████████████████▉                                                                                       | 96/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.mlp.up_proj.weight]Loading weights:  33%|██████████████████████████████████████████▉                                                                                       | 96/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.mlp.up_proj.weight]Loading weights:  33%|███████████████████████████████████████                                                                              | 97/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.post_attention_layernorm.weight]Loading weights:  33%|███████████████████████████████████████                                                                              | 97/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.post_attention_layernorm.weight]Loading weights:  34%|██████████████████████████████████████████                                                                                   | 98/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.self_attn.k_proj.weight]Loading weights:  34%|██████████████████████████████████████████                                                                                   | 98/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.self_attn.k_proj.weight]Loading weights:  34%|██████████████████████████████████████████▌                                                                                  | 99/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.self_attn.o_proj.weight]Loading weights:  34%|██████████████████████████████████████████▌                                                                                  | 99/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.self_attn.o_proj.weight]Loading weights:  34%|██████████████████████████████████████████▌                                                                                 | 100/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.self_attn.q_proj.weight]Loading weights:  34%|██████████████████████████████████████████▌                                                                                 | 100/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.self_attn.q_proj.weight]Loading weights:  35%|███████████████████████████████████████████                                                                                 | 101/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.self_attn.v_proj.weight]Loading weights:  35%|███████████████████████████████████████████                                                                                 | 101/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.10.self_attn.v_proj.weight]Loading weights:  35%|███████████████████████████████████████████▊                                                                                 | 102/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.input_layernorm.weight]Loading weights:  35%|███████████████████████████████████████████▊                                                                                 | 102/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.input_layernorm.weight]Loading weights:  35%|████████████████████████████████████████████▉                                                                                  | 103/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.mlp.down_proj.weight]Loading weights:  35%|████████████████████████████████████████████▉                                                                                  | 103/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.mlp.down_proj.weight]Loading weights:  36%|█████████████████████████████████████████████▍                                                                                 | 104/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.mlp.gate_proj.weight]Loading weights:  36%|█████████████████████████████████████████████▍                                                                                 | 104/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.mlp.gate_proj.weight]Loading weights:  36%|██████████████████████████████████████████████▌                                                                                  | 105/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.mlp.up_proj.weight]Loading weights:  36%|██████████████████████████████████████████████▌                                                                                  | 105/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.mlp.up_proj.weight]Loading weights:  36%|██████████████████████████████████████████▎                                                                         | 106/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.post_attention_layernorm.weight]Loading weights:  36%|██████████████████████████████████████████▎                                                                         | 106/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.post_attention_layernorm.weight]Loading weights:  37%|█████████████████████████████████████████████▌                                                                              | 107/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.self_attn.k_proj.weight]Loading weights:  37%|█████████████████████████████████████████████▌                                                                              | 107/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.self_attn.k_proj.weight]Loading weights:  37%|██████████████████████████████████████████████                                                                              | 108/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.self_attn.o_proj.weight]Loading weights:  37%|██████████████████████████████████████████████                                                                              | 108/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.self_attn.o_proj.weight]Loading weights:  37%|██████████████████████████████████████████████▍                                                                             | 109/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.self_attn.q_proj.weight]Loading weights:  37%|██████████████████████████████████████████████▍                                                                             | 109/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.self_attn.q_proj.weight]Loading weights:  38%|██████████████████████████████████████████████▊                                                                             | 110/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.self_attn.v_proj.weight]Loading weights:  38%|██████████████████████████████████████████████▊                                                                             | 110/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.11.self_attn.v_proj.weight]Loading weights:  38%|███████████████████████████████████████████████▋                                                                             | 111/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.input_layernorm.weight]Loading weights:  38%|███████████████████████████████████████████████▋                                                                             | 111/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.input_layernorm.weight]Loading weights:  38%|████████████████████████████████████████████████▉                                                                              | 112/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.mlp.down_proj.weight]Loading weights:  38%|████████████████████████████████████████████████▉                                                                              | 112/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.mlp.down_proj.weight]Loading weights:  39%|█████████████████████████████████████████████████▎                                                                             | 113/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.mlp.gate_proj.weight]Loading weights:  39%|█████████████████████████████████████████████████▎                                                                             | 113/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.mlp.gate_proj.weight]Loading weights:  39%|██████████████████████████████████████████████████▌                                                                              | 114/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.mlp.up_proj.weight]Loading weights:  39%|██████████████████████████████████████████████████▌                                                                              | 114/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.mlp.up_proj.weight]Loading weights:  40%|█████████████████████████████████████████████▊                                                                      | 115/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.post_attention_layernorm.weight]Loading weights:  40%|█████████████████████████████████████████████▊                                                                      | 115/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.post_attention_layernorm.weight]Loading weights:  40%|█████████████████████████████████████████████████▍                                                                          | 116/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.self_attn.k_proj.weight]Loading weights:  40%|█████████████████████████████████████████████████▍                                                                          | 116/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.self_attn.k_proj.weight]Loading weights:  40%|█████████████████████████████████████████████████▊                                                                          | 117/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.self_attn.o_proj.weight]Loading weights:  40%|█████████████████████████████████████████████████▊                                                                          | 117/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.self_attn.o_proj.weight]Loading weights:  41%|██████████████████████████████████████████████████▎                                                                         | 118/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.self_attn.q_proj.weight]Loading weights:  41%|██████████████████████████████████████████████████▎                                                                         | 118/291 [00:00<00:00, 245.32it/s, Materializing param=model.layers.12.self_attn.q_proj.weight]Loading weights:  41%|██████████████████████████████████████████████████▋                                                                         | 119/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.12.self_attn.q_proj.weight]Loading weights:  41%|██████████████████████████████████████████████████▋                                                                         | 119/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.12.self_attn.v_proj.weight]Loading weights:  41%|██████████████████████████████████████████████████▋                                                                         | 119/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.12.self_attn.v_proj.weight]Loading weights:  41%|███████████████████████████████████████████████████▌                                                                         | 120/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.input_layernorm.weight]Loading weights:  41%|███████████████████████████████████████████████████▌                                                                         | 120/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.input_layernorm.weight]Loading weights:  42%|████████████████████████████████████████████████████▊                                                                          | 121/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.mlp.down_proj.weight]Loading weights:  42%|████████████████████████████████████████████████████▊                                                                          | 121/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.mlp.down_proj.weight]Loading weights:  42%|█████████████████████████████████████████████████████▏                                                                         | 122/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.mlp.gate_proj.weight]Loading weights:  42%|█████████████████████████████████████████████████████▏                                                                         | 122/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.mlp.gate_proj.weight]Loading weights:  42%|██████████████████████████████████████████████████████▌                                                                          | 123/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.mlp.up_proj.weight]Loading weights:  42%|██████████████████████████████████████████████████████▌                                                                          | 123/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.mlp.up_proj.weight]Loading weights:  43%|█████████████████████████████████████████████████▍                                                                  | 124/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.post_attention_layernorm.weight]Loading weights:  43%|█████████████████████████████████████████████████▍                                                                  | 124/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.post_attention_layernorm.weight]Loading weights:  43%|█████████████████████████████████████████████████████▎                                                                      | 125/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.self_attn.k_proj.weight]Loading weights:  43%|█████████████████████████████████████████████████████▎                                                                      | 125/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.self_attn.k_proj.weight]Loading weights:  43%|█████████████████████████████████████████████████████▋                                                                      | 126/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.self_attn.o_proj.weight]Loading weights:  43%|█████████████████████████████████████████████████████▋                                                                      | 126/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.self_attn.o_proj.weight]Loading weights:  44%|██████████████████████████████████████████████████████                                                                      | 127/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.self_attn.q_proj.weight]Loading weights:  44%|██████████████████████████████████████████████████████                                                                      | 127/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.self_attn.q_proj.weight]Loading weights:  44%|██████████████████████████████████████████████████████▌                                                                     | 128/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.self_attn.v_proj.weight]Loading weights:  44%|██████████████████████████████████████████████████████▌                                                                     | 128/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.13.self_attn.v_proj.weight]Loading weights:  44%|███████████████████████████████████████████████████████▍                                                                     | 129/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.input_layernorm.weight]Loading weights:  44%|███████████████████████████████████████████████████████▍                                                                     | 129/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.input_layernorm.weight]Loading weights:  45%|████████████████████████████████████████████████████████▋                                                                      | 130/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.mlp.down_proj.weight]Loading weights:  45%|████████████████████████████████████████████████████████▋                                                                      | 130/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.mlp.down_proj.weight]Loading weights:  45%|█████████████████████████████████████████████████████████▏                                                                     | 131/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.mlp.gate_proj.weight]Loading weights:  45%|█████████████████████████████████████████████████████████▏                                                                     | 131/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.mlp.gate_proj.weight]Loading weights:  45%|██████████████████████████████████████████████████████████▌                                                                      | 132/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.mlp.up_proj.weight]Loading weights:  45%|██████████████████████████████████████████████████████████▌                                                                      | 132/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.mlp.up_proj.weight]Loading weights:  46%|█████████████████████████████████████████████████████                                                               | 133/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.post_attention_layernorm.weight]Loading weights:  46%|█████████████████████████████████████████████████████                                                               | 133/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.post_attention_layernorm.weight]Loading weights:  46%|█████████████████████████████████████████████████████████                                                                   | 134/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.self_attn.k_proj.weight]Loading weights:  46%|█████████████████████████████████████████████████████████                                                                   | 134/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.self_attn.k_proj.weight]Loading weights:  46%|█████████████████████████████████████████████████████████▌                                                                  | 135/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.self_attn.o_proj.weight]Loading weights:  46%|█████████████████████████████████████████████████████████▌                                                                  | 135/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.self_attn.o_proj.weight]Loading weights:  47%|█████████████████████████████████████████████████████████▉                                                                  | 136/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.self_attn.q_proj.weight]Loading weights:  47%|█████████████████████████████████████████████████████████▉                                                                  | 136/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.self_attn.q_proj.weight]Loading weights:  47%|██████████████████████████████████████████████████████████▍                                                                 | 137/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.self_attn.v_proj.weight]Loading weights:  47%|██████████████████████████████████████████████████████████▍                                                                 | 137/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.14.self_attn.v_proj.weight]Loading weights:  47%|███████████████████████████████████████████████████████████▎                                                                 | 138/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.input_layernorm.weight]Loading weights:  47%|███████████████████████████████████████████████████████████▎                                                                 | 138/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.input_layernorm.weight]Loading weights:  48%|████████████████████████████████████████████████████████████▋                                                                  | 139/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.mlp.down_proj.weight]Loading weights:  48%|████████████████████████████████████████████████████████████▋                                                                  | 139/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.mlp.down_proj.weight]Loading weights:  48%|█████████████████████████████████████████████████████████████                                                                  | 140/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.mlp.gate_proj.weight]Loading weights:  48%|█████████████████████████████████████████████████████████████                                                                  | 140/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.mlp.gate_proj.weight]Loading weights:  48%|██████████████████████████████████████████████████████████████▌                                                                  | 141/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.mlp.up_proj.weight]Loading weights:  48%|██████████████████████████████████████████████████████████████▌                                                                  | 141/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.mlp.up_proj.weight]Loading weights:  49%|████████████████████████████████████████████████████████▌                                                           | 142/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.post_attention_layernorm.weight]Loading weights:  49%|████████████████████████████████████████████████████████▌                                                           | 142/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.post_attention_layernorm.weight]Loading weights:  49%|████████████████████████████████████████████████████████████▉                                                               | 143/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.self_attn.k_proj.weight]Loading weights:  49%|████████████████████████████████████████████████████████████▉                                                               | 143/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.self_attn.k_proj.weight]Loading weights:  49%|█████████████████████████████████████████████████████████████▎                                                              | 144/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.self_attn.o_proj.weight]Loading weights:  49%|█████████████████████████████████████████████████████████████▎                                                              | 144/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.self_attn.o_proj.weight]Loading weights:  50%|█████████████████████████████████████████████████████████████▊                                                              | 145/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.self_attn.q_proj.weight]Loading weights:  50%|█████████████████████████████████████████████████████████████▊                                                              | 145/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.self_attn.q_proj.weight]Loading weights:  50%|██████████████████████████████████████████████████████████████▏                                                             | 146/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.self_attn.v_proj.weight]Loading weights:  50%|██████████████████████████████████████████████████████████████▏                                                             | 146/291 [00:00<00:00, 264.30it/s, Materializing param=model.layers.15.self_attn.v_proj.weight]Loading weights:  51%|██████████████████████████████████████████████████████████████▋                                                             | 147/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.15.self_attn.v_proj.weight]Loading weights:  51%|███████████████████████████████████████████████████████████████▏                                                             | 147/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.input_layernorm.weight]Loading weights:  51%|███████████████████████████████████████████████████████████████▏                                                             | 147/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.input_layernorm.weight]Loading weights:  51%|████████████████████████████████████████████████████████████████▌                                                              | 148/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.mlp.down_proj.weight]Loading weights:  51%|████████████████████████████████████████████████████████████████▌                                                              | 148/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.mlp.down_proj.weight]Loading weights:  51%|█████████████████████████████████████████████████████████████████                                                              | 149/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.mlp.gate_proj.weight]Loading weights:  51%|█████████████████████████████████████████████████████████████████                                                              | 149/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.mlp.gate_proj.weight]Loading weights:  52%|██████████████████████████████████████████████████████████████████▍                                                              | 150/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.mlp.up_proj.weight]Loading weights:  52%|██████████████████████████████████████████████████████████████████▍                                                              | 150/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.mlp.up_proj.weight]Loading weights:  52%|████████████████████████████████████████████████████████████▏                                                       | 151/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.post_attention_layernorm.weight]Loading weights:  52%|████████████████████████████████████████████████████████████▏                                                       | 151/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.post_attention_layernorm.weight]Loading weights:  52%|████████████████████████████████████████████████████████████████▊                                                           | 152/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.self_attn.k_proj.weight]Loading weights:  52%|████████████████████████████████████████████████████████████████▊                                                           | 152/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.self_attn.k_proj.weight]Loading weights:  53%|█████████████████████████████████████████████████████████████████▏                                                          | 153/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.self_attn.o_proj.weight]Loading weights:  53%|█████████████████████████████████████████████████████████████████▏                                                          | 153/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.self_attn.o_proj.weight]Loading weights:  53%|█████████████████████████████████████████████████████████████████▌                                                          | 154/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.self_attn.q_proj.weight]Loading weights:  53%|█████████████████████████████████████████████████████████████████▌                                                          | 154/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.self_attn.q_proj.weight]Loading weights:  53%|██████████████████████████████████████████████████████████████████                                                          | 155/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.self_attn.v_proj.weight]Loading weights:  53%|██████████████████████████████████████████████████████████████████                                                          | 155/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.16.self_attn.v_proj.weight]Loading weights:  54%|███████████████████████████████████████████████████████████████████                                                          | 156/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.input_layernorm.weight]Loading weights:  54%|███████████████████████████████████████████████████████████████████                                                          | 156/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.input_layernorm.weight]Loading weights:  54%|████████████████████████████████████████████████████████████████████▌                                                          | 157/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.mlp.down_proj.weight]Loading weights:  54%|████████████████████████████████████████████████████████████████████▌                                                          | 157/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.mlp.down_proj.weight]Loading weights:  54%|████████████████████████████████████████████████████████████████████▉                                                          | 158/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.mlp.gate_proj.weight]Loading weights:  54%|████████████████████████████████████████████████████████████████████▉                                                          | 158/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.mlp.gate_proj.weight]Loading weights:  55%|██████████████████████████████████████████████████████████████████████▍                                                          | 159/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.mlp.up_proj.weight]Loading weights:  55%|██████████████████████████████████████████████████████████████████████▍                                                          | 159/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.mlp.up_proj.weight]Loading weights:  55%|███████████████████████████████████████████████████████████████▊                                                    | 160/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.post_attention_layernorm.weight]Loading weights:  55%|███████████████████████████████████████████████████████████████▊                                                    | 160/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.post_attention_layernorm.weight]Loading weights:  55%|████████████████████████████████████████████████████████████████████▌                                                       | 161/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.self_attn.k_proj.weight]Loading weights:  55%|████████████████████████████████████████████████████████████████████▌                                                       | 161/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.self_attn.k_proj.weight]Loading weights:  56%|█████████████████████████████████████████████████████████████████████                                                       | 162/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.self_attn.o_proj.weight]Loading weights:  56%|█████████████████████████████████████████████████████████████████████                                                       | 162/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.self_attn.o_proj.weight]Loading weights:  56%|█████████████████████████████████████████████████████████████████████▍                                                      | 163/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.self_attn.q_proj.weight]Loading weights:  56%|█████████████████████████████████████████████████████████████████████▍                                                      | 163/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.self_attn.q_proj.weight]Loading weights:  56%|█████████████████████████████████████████████████████████████████████▉                                                      | 164/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.self_attn.v_proj.weight]Loading weights:  56%|█████████████████████████████████████████████████████████████████████▉                                                      | 164/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.17.self_attn.v_proj.weight]Loading weights:  57%|██████████████████████████████████████████████████████████████████████▉                                                      | 165/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.input_layernorm.weight]Loading weights:  57%|██████████████████████████████████████████████████████████████████████▉                                                      | 165/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.input_layernorm.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████████████▍                                                      | 166/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.mlp.down_proj.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████████████▍                                                      | 166/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.mlp.down_proj.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████████████▉                                                      | 167/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.mlp.gate_proj.weight]Loading weights:  57%|████████████████████████████████████████████████████████████████████████▉                                                      | 167/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.mlp.gate_proj.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████████████████▍                                                      | 168/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.mlp.up_proj.weight]Loading weights:  58%|██████████████████████████████████████████████████████████████████████████▍                                                      | 168/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.mlp.up_proj.weight]Loading weights:  58%|███████████████████████████████████████████████████████████████████▎                                                | 169/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.post_attention_layernorm.weight]Loading weights:  58%|███████████████████████████████████████████████████████████████████▎                                                | 169/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.post_attention_layernorm.weight]Loading weights:  58%|████████████████████████████████████████████████████████████████████████▍                                                   | 170/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.self_attn.k_proj.weight]Loading weights:  58%|████████████████████████████████████████████████████████████████████████▍                                                   | 170/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.self_attn.k_proj.weight]Loading weights:  59%|████████████████████████████████████████████████████████████████████████▊                                                   | 171/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.self_attn.o_proj.weight]Loading weights:  59%|████████████████████████████████████████████████████████████████████████▊                                                   | 171/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.self_attn.o_proj.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████████████▎                                                  | 172/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.self_attn.q_proj.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████████████▎                                                  | 172/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.self_attn.q_proj.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████████████▋                                                  | 173/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.self_attn.v_proj.weight]Loading weights:  59%|█████████████████████████████████████████████████████████████████████████▋                                                  | 173/291 [00:00<00:00, 267.31it/s, Materializing param=model.layers.18.self_attn.v_proj.weight]Loading weights:  60%|██████████████████████████████████████████████████████████████████████████▏                                                 | 174/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.18.self_attn.v_proj.weight]Loading weights:  60%|██████████████████████████████████████████████████████████████████████████▋                                                  | 174/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.input_layernorm.weight]Loading weights:  60%|██████████████████████████████████████████████████████████████████████████▋                                                  | 174/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.input_layernorm.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████████▎                                                  | 175/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.mlp.down_proj.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████████▎                                                  | 175/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.mlp.down_proj.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████████▊                                                  | 176/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.mlp.gate_proj.weight]Loading weights:  60%|████████████████████████████████████████████████████████████████████████████▊                                                  | 176/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.mlp.gate_proj.weight]Loading weights:  61%|██████████████████████████████████████████████████████████████████████████████▍                                                  | 177/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.mlp.up_proj.weight]Loading weights:  61%|██████████████████████████████████████████████████████████████████████████████▍                                                  | 177/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.mlp.up_proj.weight]Loading weights:  61%|██████████████████████████████████████████████████████████████████████▉                                             | 178/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.post_attention_layernorm.weight]Loading weights:  61%|██████████████████████████████████████████████████████████████████████▉                                             | 178/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.post_attention_layernorm.weight]Loading weights:  62%|████████████████████████████████████████████████████████████████████████████▎                                               | 179/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.self_attn.k_proj.weight]Loading weights:  62%|████████████████████████████████████████████████████████████████████████████▎                                               | 179/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.self_attn.k_proj.weight]Loading weights:  62%|████████████████████████████████████████████████████████████████████████████▋                                               | 180/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.self_attn.o_proj.weight]Loading weights:  62%|████████████████████████████████████████████████████████████████████████████▋                                               | 180/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.self_attn.o_proj.weight]Loading weights:  62%|█████████████████████████████████████████████████████████████████████████████▏                                              | 181/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.self_attn.q_proj.weight]Loading weights:  62%|█████████████████████████████████████████████████████████████████████████████▏                                              | 181/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.self_attn.q_proj.weight]Loading weights:  63%|█████████████████████████████████████████████████████████████████████████████▌                                              | 182/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.self_attn.v_proj.weight]Loading weights:  63%|█████████████████████████████████████████████████████████████████████████████▌                                              | 182/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.19.self_attn.v_proj.weight]Loading weights:  63%|██████████████████████████████████████████████████████████████████████████████▌                                              | 183/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.input_layernorm.weight]Loading weights:  63%|██████████████████████████████████████████████████████████████████████████████▌                                              | 183/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.input_layernorm.weight]Loading weights:  63%|████████████████████████████████████████████████████████████████████████████████▎                                              | 184/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.mlp.down_proj.weight]Loading weights:  63%|████████████████████████████████████████████████████████████████████████████████▎                                              | 184/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.mlp.down_proj.weight]Loading weights:  64%|████████████████████████████████████████████████████████████████████████████████▋                                              | 185/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.mlp.gate_proj.weight]Loading weights:  64%|████████████████████████████████████████████████████████████████████████████████▋                                              | 185/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.mlp.gate_proj.weight]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████████████▍                                              | 186/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.mlp.up_proj.weight]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████████████▍                                              | 186/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.mlp.up_proj.weight]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████▌                                         | 187/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.post_attention_layernorm.weight]Loading weights:  64%|██████████████████████████████████████████████████████████████████████████▌                                         | 187/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.post_attention_layernorm.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████████████                                            | 188/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.self_attn.k_proj.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████████████                                            | 188/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.self_attn.k_proj.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████████████▌                                           | 189/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.self_attn.o_proj.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████████████▌                                           | 189/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.self_attn.o_proj.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████████████▉                                           | 190/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.self_attn.q_proj.weight]Loading weights:  65%|████████████████████████████████████████████████████████████████████████████████▉                                           | 190/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.self_attn.q_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████████▍                                          | 191/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.self_attn.v_proj.weight]Loading weights:  66%|█████████████████████████████████████████████████████████████████████████████████▍                                          | 191/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.20.self_attn.v_proj.weight]Loading weights:  66%|██████████████████████████████████████████████████████████████████████████████████▍                                          | 192/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.input_layernorm.weight]Loading weights:  66%|██████████████████████████████████████████████████████████████████████████████████▍                                          | 192/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.input_layernorm.weight]Loading weights:  66%|████████████████████████████████████████████████████████████████████████████████████▏                                          | 193/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.mlp.down_proj.weight]Loading weights:  66%|████████████████████████████████████████████████████████████████████████████████████▏                                          | 193/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.mlp.down_proj.weight]Loading weights:  67%|████████████████████████████████████████████████████████████████████████████████████▋                                          | 194/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.mlp.gate_proj.weight]Loading weights:  67%|████████████████████████████████████████████████████████████████████████████████████▋                                          | 194/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.mlp.gate_proj.weight]Loading weights:  67%|██████████████████████████████████████████████████████████████████████████████████████▍                                          | 195/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.mlp.up_proj.weight]Loading weights:  67%|██████████████████████████████████████████████████████████████████████████████████████▍                                          | 195/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.mlp.up_proj.weight]Loading weights:  67%|██████████████████████████████████████████████████████████████████████████████▏                                     | 196/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.post_attention_layernorm.weight]Loading weights:  67%|██████████████████████████████████████████████████████████████████████████████▏                                     | 196/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.post_attention_layernorm.weight]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████████▉                                        | 197/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.self_attn.k_proj.weight]Loading weights:  68%|███████████████████████████████████████████████████████████████████████████████████▉                                        | 197/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.self_attn.k_proj.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████████████▎                                       | 198/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.self_attn.o_proj.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████████████▎                                       | 198/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.self_attn.o_proj.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████████████▊                                       | 199/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.self_attn.q_proj.weight]Loading weights:  68%|████████████████████████████████████████████████████████████████████████████████████▊                                       | 199/291 [00:00<00:00, 250.12it/s, Materializing param=model.layers.21.self_attn.q_proj.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████████████▏                                      | 200/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.21.self_attn.q_proj.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████████████▏                                      | 200/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.21.self_attn.v_proj.weight]Loading weights:  69%|█████████████████████████████████████████████████████████████████████████████████████▏                                      | 200/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.21.self_attn.v_proj.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████████████████▎                                      | 201/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.input_layernorm.weight]Loading weights:  69%|██████████████████████████████████████████████████████████████████████████████████████▎                                      | 201/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.input_layernorm.weight]Loading weights:  69%|████████████████████████████████████████████████████████████████████████████████████████▏                                      | 202/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.mlp.down_proj.weight]Loading weights:  69%|████████████████████████████████████████████████████████████████████████████████████████▏                                      | 202/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.mlp.down_proj.weight]Loading weights:  70%|████████████████████████████████████████████████████████████████████████████████████████▌                                      | 203/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.mlp.gate_proj.weight]Loading weights:  70%|████████████████████████████████████████████████████████████████████████████████████████▌                                      | 203/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.mlp.gate_proj.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████████████████▍                                      | 204/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.mlp.up_proj.weight]Loading weights:  70%|██████████████████████████████████████████████████████████████████████████████████████████▍                                      | 204/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.mlp.up_proj.weight]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████▋                                  | 205/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.post_attention_layernorm.weight]Loading weights:  70%|█████████████████████████████████████████████████████████████████████████████████▋                                  | 205/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.post_attention_layernorm.weight]Loading weights:  71%|███████████████████████████████████████████████████████████████████████████████████████▊                                    | 206/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.self_attn.k_proj.weight]Loading weights:  71%|███████████████████████████████████████████████████████████████████████████████████████▊                                    | 206/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.self_attn.k_proj.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████████████████▏                                   | 207/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.self_attn.o_proj.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████████████████▏                                   | 207/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.self_attn.o_proj.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████████████████▋                                   | 208/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.self_attn.q_proj.weight]Loading weights:  71%|████████████████████████████████████████████████████████████████████████████████████████▋                                   | 208/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.self_attn.q_proj.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████████████                                   | 209/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.self_attn.v_proj.weight]Loading weights:  72%|█████████████████████████████████████████████████████████████████████████████████████████                                   | 209/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.22.self_attn.v_proj.weight]Loading weights:  72%|██████████████████████████████████████████████████████████████████████████████████████████▏                                  | 210/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.input_layernorm.weight]Loading weights:  72%|██████████████████████████████████████████████████████████████████████████████████████████▏                                  | 210/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.input_layernorm.weight]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████████████████                                   | 211/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.mlp.down_proj.weight]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████████████████                                   | 211/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.mlp.down_proj.weight]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 212/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.mlp.gate_proj.weight]Loading weights:  73%|████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 212/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.mlp.gate_proj.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 213/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.mlp.up_proj.weight]Loading weights:  73%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 213/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.mlp.up_proj.weight]Loading weights:  74%|█████████████████████████████████████████████████████████████████████████████████████▎                              | 214/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.post_attention_layernorm.weight]Loading weights:  74%|█████████████████████████████████████████████████████████████████████████████████████▎                              | 214/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.post_attention_layernorm.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████████████▌                                | 215/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.self_attn.k_proj.weight]Loading weights:  74%|███████████████████████████████████████████████████████████████████████████████████████████▌                                | 215/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.self_attn.k_proj.weight]Loading weights:  74%|████████████████████████████████████████████████████████████████████████████████████████████                                | 216/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.self_attn.o_proj.weight]Loading weights:  74%|████████████████████████████████████████████████████████████████████████████████████████████                                | 216/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.self_attn.o_proj.weight]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████████████▍                               | 217/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.self_attn.q_proj.weight]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████████████▍                               | 217/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.self_attn.q_proj.weight]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████████████▉                               | 218/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.self_attn.v_proj.weight]Loading weights:  75%|████████████████████████████████████████████████████████████████████████████████████████████▉                               | 218/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.23.self_attn.v_proj.weight]Loading weights:  75%|██████████████████████████████████████████████████████████████████████████████████████████████                               | 219/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.input_layernorm.weight]Loading weights:  75%|██████████████████████████████████████████████████████████████████████████████████████████████                               | 219/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.input_layernorm.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████████████████                               | 220/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.mlp.down_proj.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████████████████                               | 220/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.mlp.down_proj.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 221/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.mlp.gate_proj.weight]Loading weights:  76%|████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 221/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.mlp.gate_proj.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 222/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.mlp.up_proj.weight]Loading weights:  76%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 222/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.mlp.up_proj.weight]Loading weights:  77%|████████████████████████████████████████████████████████████████████████████████████████▉                           | 223/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.post_attention_layernorm.weight]Loading weights:  77%|████████████████████████████████████████████████████████████████████████████████████████▉                           | 223/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.post_attention_layernorm.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████████████████▍                            | 224/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.self_attn.k_proj.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████████████████▍                            | 224/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.self_attn.k_proj.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████████████████▉                            | 225/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.self_attn.o_proj.weight]Loading weights:  77%|███████████████████████████████████████████████████████████████████████████████████████████████▉                            | 225/291 [00:00<00:00, 241.64it/s, Materializing param=model.layers.24.self_attn.o_proj.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 226/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.24.self_attn.o_proj.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 226/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.24.self_attn.q_proj.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 226/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.24.self_attn.q_proj.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 227/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.24.self_attn.v_proj.weight]Loading weights:  78%|████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 227/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.24.self_attn.v_proj.weight]Loading weights:  78%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 228/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.input_layernorm.weight]Loading weights:  78%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 228/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.input_layernorm.weight]Loading weights:  79%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 229/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.mlp.down_proj.weight]Loading weights:  79%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 229/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.mlp.down_proj.weight]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 230/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.mlp.gate_proj.weight]Loading weights:  79%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 230/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.mlp.gate_proj.weight]Loading weights:  79%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 231/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.mlp.up_proj.weight]Loading weights:  79%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 231/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.mlp.up_proj.weight]Loading weights:  80%|████████████████████████████████████████████████████████████████████████████████████████████▍                       | 232/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.post_attention_layernorm.weight]Loading weights:  80%|████████████████████████████████████████████████████████████████████████████████████████████▍                       | 232/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.post_attention_layernorm.weight]Loading weights:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 233/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.self_attn.k_proj.weight]Loading weights:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 233/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.self_attn.k_proj.weight]Loading weights:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 234/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.self_attn.o_proj.weight]Loading weights:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 234/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.self_attn.o_proj.weight]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 235/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.self_attn.q_proj.weight]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 235/291 [00:00<00:00, 246.74it/s, Materializing param=model.layers.25.self_attn.q_proj.weight]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 236/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.25.self_attn.v_proj.weight]Loading weights:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 236/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.25.self_attn.v_proj.weight]Loading weights:  81%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 237/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.input_layernorm.weight]Loading weights:  81%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 237/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.input_layernorm.weight]Loading weights:  82%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 238/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.mlp.down_proj.weight]Loading weights:  82%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 238/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.mlp.down_proj.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 239/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.mlp.gate_proj.weight]Loading weights:  82%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 239/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.mlp.gate_proj.weight]Loading weights:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 240/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.mlp.up_proj.weight]Loading weights:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 240/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.mlp.up_proj.weight]Loading weights:  83%|████████████████████████████████████████████████████████████████████████████████████████████████                    | 241/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.post_attention_layernorm.weight]Loading weights:  83%|████████████████████████████████████████████████████████████████████████████████████████████████                    | 241/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.post_attention_layernorm.weight]Loading weights:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████                     | 242/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.self_attn.k_proj.weight]Loading weights:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████                     | 242/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.self_attn.k_proj.weight]Loading weights:  84%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 243/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.self_attn.o_proj.weight]Loading weights:  84%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 243/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.self_attn.o_proj.weight]Loading weights:  84%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 244/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.self_attn.q_proj.weight]Loading weights:  84%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 244/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.self_attn.q_proj.weight]Loading weights:  84%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 245/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.self_attn.v_proj.weight]Loading weights:  84%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 245/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.26.self_attn.v_proj.weight]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 246/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.27.input_layernorm.weight]Loading weights:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 246/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.27.input_layernorm.weight]Loading weights:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 247/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.27.mlp.down_proj.weight]Loading weights:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 247/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.27.mlp.down_proj.weight]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 248/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.27.mlp.gate_proj.weight]Loading weights:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 248/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.27.mlp.gate_proj.weight]Loading weights:  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 249/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.27.mlp.up_proj.weight]Loading weights:  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 249/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.27.mlp.up_proj.weight]Loading weights:  86%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                | 250/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.27.post_attention_layernorm.weight]Loading weights:  86%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                | 250/291 [00:01<00:00, 246.74it/s, Materializing param=model.layers.27.post_attention_layernorm.weight]Loading weights:  86%|████████████████████████████████████████████████████████████████████████████████████████████████████                | 251/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.27.post_attention_layernorm.weight]Loading weights:  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 251/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.27.self_attn.k_proj.weight]Loading weights:  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 251/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.27.self_attn.k_proj.weight]Loading weights:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 252/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.27.self_attn.o_proj.weight]Loading weights:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 252/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.27.self_attn.o_proj.weight]Loading weights:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 253/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.27.self_attn.q_proj.weight]Loading weights:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 253/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.27.self_attn.q_proj.weight]Loading weights:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 254/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.27.self_attn.v_proj.weight]Loading weights:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 254/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.27.self_attn.v_proj.weight]Loading weights:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 255/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.input_layernorm.weight]Loading weights:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 255/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.input_layernorm.weight]Loading weights:  88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 256/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.mlp.down_proj.weight]Loading weights:  88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 256/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.mlp.down_proj.weight]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 257/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.mlp.gate_proj.weight]Loading weights:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 257/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.mlp.gate_proj.weight]Loading weights:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 258/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.mlp.up_proj.weight]Loading weights:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 258/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.mlp.up_proj.weight]Loading weights:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 259/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.post_attention_layernorm.weight]Loading weights:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 259/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.post_attention_layernorm.weight]Loading weights:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 260/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.self_attn.k_proj.weight]Loading weights:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 260/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.self_attn.k_proj.weight]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 261/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.self_attn.o_proj.weight]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 261/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.self_attn.o_proj.weight]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 262/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.self_attn.q_proj.weight]Loading weights:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 262/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.self_attn.q_proj.weight]Loading weights:  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 263/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.self_attn.v_proj.weight]Loading weights:  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 263/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.28.self_attn.v_proj.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 264/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.input_layernorm.weight]Loading weights:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 264/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.input_layernorm.weight]Loading weights:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 265/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.mlp.down_proj.weight]Loading weights:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 265/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.mlp.down_proj.weight]Loading weights:  91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 266/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.mlp.gate_proj.weight]Loading weights:  91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 266/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.mlp.gate_proj.weight]Loading weights:  92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 267/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.mlp.up_proj.weight]Loading weights:  92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 267/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.mlp.up_proj.weight]Loading weights:  92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 268/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.post_attention_layernorm.weight]Loading weights:  92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 268/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.post_attention_layernorm.weight]Loading weights:  92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 269/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.self_attn.k_proj.weight]Loading weights:  92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 269/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.self_attn.k_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 270/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.self_attn.o_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 270/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.self_attn.o_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 271/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.self_attn.q_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 271/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.self_attn.q_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 272/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.self_attn.v_proj.weight]Loading weights:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 272/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.29.self_attn.v_proj.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 273/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.30.input_layernorm.weight]Loading weights:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 273/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.30.input_layernorm.weight]Loading weights:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 274/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.30.mlp.down_proj.weight]Loading weights:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 274/291 [00:01<00:00, 228.90it/s, Materializing param=model.layers.30.mlp.down_proj.weight]Loading weights:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 275/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.mlp.down_proj.weight]Loading weights:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 275/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.mlp.gate_proj.weight]Loading weights:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 275/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.mlp.gate_proj.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 276/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.mlp.up_proj.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 276/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.mlp.up_proj.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 277/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.post_attention_layernorm.weight]Loading weights:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 277/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.post_attention_layernorm.weight]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 278/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.self_attn.k_proj.weight]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 278/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.self_attn.k_proj.weight]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 279/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.self_attn.o_proj.weight]Loading weights:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 279/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.self_attn.o_proj.weight]Loading weights:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 280/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.self_attn.q_proj.weight]Loading weights:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 280/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.self_attn.q_proj.weight]Loading weights:  97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 281/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.self_attn.v_proj.weight]Loading weights:  97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 281/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.30.self_attn.v_proj.weight]Loading weights:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 282/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.input_layernorm.weight]Loading weights:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 282/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.input_layernorm.weight]Loading weights:  97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 283/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.mlp.down_proj.weight]Loading weights:  97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 283/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.mlp.down_proj.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 284/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.mlp.gate_proj.weight]Loading weights:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 284/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.mlp.gate_proj.weight]Loading weights:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 285/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.mlp.up_proj.weight]Loading weights:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 285/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.mlp.up_proj.weight]Loading weights:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 286/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.post_attention_layernorm.weight]Loading weights:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 286/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.post_attention_layernorm.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 287/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.self_attn.k_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 287/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.self_attn.k_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 288/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.self_attn.o_proj.weight]Loading weights:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 288/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.self_attn.o_proj.weight]Loading weights:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 289/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.self_attn.q_proj.weight]Loading weights:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 289/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.self_attn.q_proj.weight]Loading weights: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 290/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.self_attn.v_proj.weight]Loading weights: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 290/291 [00:01<00:00, 197.10it/s, Materializing param=model.layers.31.self_attn.v_proj.weight]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 291/291 [00:01<00:00, 197.10it/s, Materializing param=model.norm.weight]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 291/291 [00:01<00:00, 197.10it/s, Materializing param=model.norm.weight]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 291/291 [00:01<00:00, 217.73it/s, Materializing param=model.norm.weight]
[2026-02-12 03:39:52,693] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:356] [PID:9815] Converting modules to torch.bfloat16
[2026-02-12 03:39:53,020] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:9815] Memory usage after model load 0.000GB ()
[2026-02-12 03:39:53,026] [INFO] [axolotl.loaders.adapter.load_lora:81] [PID:9815] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
[2026-02-12 03:39:53,048] [WARNING] [torchao.<module>:39] [PID:9815] Skipping import of cpp extensions due to incompatible torch version 2.9.1+cu128 for torchao version 0.13.0
trainable params: 335,544,320 || all params: 7,577,276,416 || trainable%: 4.4283
[2026-02-12 03:39:55,819] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:9815] after adapters 0.000GB ()
[2026-02-12 03:40:04,452] [INFO] [axolotl.train.save_initial_configs:402] [PID:9815] Pre-saving adapter config to ./finetune-model-output...
[2026-02-12 03:40:04,485] [INFO] [axolotl.train.save_initial_configs:406] [PID:9815] Pre-saving tokenizer to ./finetune-model-output...
[2026-02-12 03:40:04,690] [INFO] [axolotl.train.save_initial_configs:411] [PID:9815] Pre-saving model config to ./finetune-model-output...
[2026-02-12 03:40:04,766] [INFO] [axolotl.train.execute_training:207] [PID:9815] Starting trainer...
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mevanpeterarmstrong[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.24.2
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/workspace/data/wandb/run-20260212_034005-itfgfa1x[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mpeach-forest-27[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/evanpeterarmstrong/diplonations[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/evanpeterarmstrong/diplonations/runs/itfgfa1x[0m
[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-02-12 03:40:07,637] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:9815] The Axolotl config has been saved to the WandB run under files.
  0%|                                                                                                                                                                                                                     | 0/656 [00:00<?, ?it/s]  0%|▎                                                                                                                                                                                                        | 1/656 [03:03<33:25:07, 183.68s/it]                                                                                                                                                                                                                                                  {'loss': '1.421', 'grad_norm': '5.234', 'learning_rate': '0.0001', 'ppl': '4.14', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.06', 'tokens/train_per_sec_per_gpu': '44.82', 'tokens/total': 1320960, 'tokens/trainable': 163626, 'epoch': '0.003052'}
  0%|▎                                                                                                                                                                                                        | 1/656 [03:03<33:25:07, 183.68s/it]  0%|▌                                                                                                                                                                                                        | 2/656 [05:55<32:03:21, 176.45s/it]                                                                                                                                                                                                                                                  {'loss': '2.761', 'grad_norm': '77.52', 'learning_rate': '0.0001', 'ppl': '15.82', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '104.4', 'tokens/total': 2641920, 'tokens/trainable': 354390, 'epoch': '0.006103'}
  0%|▌                                                                                                                                                                                                        | 2/656 [05:55<32:03:21, 176.45s/it]  0%|▉                                                                                                                                                                                                        | 3/656 [08:47<31:40:18, 174.61s/it]                                                                                                                                                                                                                                                  {'loss': '1.871', 'grad_norm': '64.34', 'learning_rate': '0.0001', 'ppl': '6.498', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.32', 'tokens/total': 3962880, 'tokens/trainable': 521207, 'epoch': '0.009155'}
  0%|▉                                                                                                                                                                                                        | 3/656 [08:47<31:40:18, 174.61s/it]  1%|█▏                                                                                                                                                                                                       | 4/656 [11:38<31:22:26, 173.23s/it]                                                                                                                                                                                                                                                  {'loss': '11.91', 'grad_norm': '1034', 'learning_rate': '0.0001', 'ppl': '1.494e+05', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.84', 'tokens/total': 5283840, 'tokens/trainable': 679562, 'epoch': '0.01221'}
  1%|█▏                                                                                                                                                                                                       | 4/656 [11:38<31:22:26, 173.23s/it]  1%|█▌                                                                                                                                                                                                       | 5/656 [14:27<31:03:46, 171.78s/it]                                                                                                                                                                                                                                                  {'loss': '2.833', 'grad_norm': '187.5', 'learning_rate': '0.0001', 'ppl': '17', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.75', 'tokens/total': 6604800, 'tokens/trainable': 827137, 'epoch': '0.01526'}
  1%|█▌                                                                                                                                                                                                       | 5/656 [14:27<31:03:46, 171.78s/it]  1%|█▊                                                                                                                                                                                                       | 6/656 [17:19<31:00:56, 171.78s/it]                                                                                                                                                                                                                                                  {'loss': '1.706', 'grad_norm': '14.29', 'learning_rate': '0.0001', 'ppl': '5.504', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.28', 'tokens/total': 7925760, 'tokens/trainable': 985165, 'epoch': '0.01831'}
  1%|█▊                                                                                                                                                                                                       | 6/656 [17:19<31:00:56, 171.78s/it]  1%|██▏                                                                                                                                                                                                      | 7/656 [20:11<30:57:58, 171.77s/it]                                                                                                                                                                                                                                                  {'loss': '1.408', 'grad_norm': '5.029', 'learning_rate': '0.0001', 'ppl': '4.088', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.27', 'tokens/total': 9246720, 'tokens/trainable': 1130559, 'epoch': '0.02136'}
  1%|██▏                                                                                                                                                                                                      | 7/656 [20:11<30:57:58, 171.77s/it]  1%|██▍                                                                                                                                                                                                      | 8/656 [23:02<30:54:07, 171.68s/it]                                                                                                                                                                                                                                                  {'loss': '1.343', 'grad_norm': '7.892', 'learning_rate': '0.0001', 'ppl': '3.832', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.82', 'tokens/total': 10567680, 'tokens/trainable': 1292040, 'epoch': '0.02441'}
  1%|██▍                                                                                                                                                                                                      | 8/656 [23:02<30:54:07, 171.68s/it]  1%|██▊                                                                                                                                                                                                      | 9/656 [25:54<30:51:59, 171.74s/it]                                                                                                                                                                                                                                                  {'loss': '1.202', 'grad_norm': '3.608', 'learning_rate': '0.0001', 'ppl': '3.326', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '89.25', 'tokens/total': 11888640, 'tokens/trainable': 1457156, 'epoch': '0.02746'}
  1%|██▊                                                                                                                                                                                                      | 9/656 [25:54<30:51:59, 171.74s/it]  2%|███                                                                                                                                                                                                     | 10/656 [28:45<30:46:09, 171.47s/it]                                                                                                                                                                                                                                                  {'loss': '1.205', 'grad_norm': '4.268', 'learning_rate': '0.0001', 'ppl': '3.338', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '23.72', 'tokens/total': 13209600, 'tokens/trainable': 1613944, 'epoch': '0.03052'}
  2%|███                                                                                                                                                                                                     | 10/656 [28:45<30:46:09, 171.47s/it]  2%|███▎                                                                                                                                                                                                    | 11/656 [31:36<30:41:10, 171.27s/it]                                                                                                                                                                                                                                                  {'loss': '1.128', 'grad_norm': '2.842', 'learning_rate': '0.0001', 'ppl': '3.09', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '71.99', 'tokens/total': 14530560, 'tokens/trainable': 1776169, 'epoch': '0.03357'}
  2%|███▎                                                                                                                                                                                                    | 11/656 [31:36<30:41:10, 171.27s/it]  2%|███▋                                                                                                                                                                                                    | 12/656 [34:28<30:39:35, 171.39s/it]                                                                                                                                                                                                                                                  {'loss': '1.123', 'grad_norm': '1.638', 'learning_rate': '0.0001', 'ppl': '3.075', 'memory/max_active (GiB)': '54.94', 'memory/max_allocated (GiB)': '54.94', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '35.8', 'tokens/total': 15851520, 'tokens/trainable': 1937300, 'epoch': '0.03662'}
  2%|███▋                                                                                                                                                                                                    | 12/656 [34:28<30:39:35, 171.39s/it]  2%|███▉                                                                                                                                                                                                    | 13/656 [37:20<30:40:31, 171.74s/it]                                                                                                                                                                                                                                                  {'loss': '1.079', 'grad_norm': '1.644', 'learning_rate': '0.0001', 'ppl': '2.941', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.71', 'tokens/total': 17172480, 'tokens/trainable': 2104178, 'epoch': '0.03967'}
  2%|███▉                                                                                                                                                                                                    | 13/656 [37:20<30:40:31, 171.74s/it]  2%|████▎                                                                                                                                                                                                   | 14/656 [40:11<30:34:47, 171.48s/it]                                                                                                                                                                                                                                                  {'loss': '1.09', 'grad_norm': '1.424', 'learning_rate': '0.0001', 'ppl': '2.975', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.46', 'tokens/total': 18493440, 'tokens/trainable': 2250249, 'epoch': '0.04272'}
  2%|████▎                                                                                                                                                                                                   | 14/656 [40:11<30:34:47, 171.48s/it]  2%|████▌                                                                                                                                                                                                   | 15/656 [43:01<30:28:34, 171.16s/it]                                                                                                                                                                                                                                                  {'loss': '1.025', 'grad_norm': '1.255', 'learning_rate': '0.0001', 'ppl': '2.787', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.22', 'tokens/total': 19814400, 'tokens/trainable': 2428946, 'epoch': '0.04577'}
  2%|████▌                                                                                                                                                                                                   | 15/656 [43:01<30:28:34, 171.16s/it]  2%|████▉                                                                                                                                                                                                   | 16/656 [45:53<30:26:15, 171.21s/it]                                                                                                                                                                                                                                                  {'loss': '1.019', 'grad_norm': '1.497', 'learning_rate': '0.0001', 'ppl': '2.771', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.51', 'tokens/total': 21135360, 'tokens/trainable': 2599892, 'epoch': '0.04883'}
  2%|████▉                                                                                                                                                                                                   | 16/656 [45:53<30:26:15, 171.21s/it]  3%|█████▏                                                                                                                                                                                                  | 17/656 [48:44<30:23:53, 171.26s/it]                                                                                                                                                                                                                                                  {'loss': '0.9963', 'grad_norm': '1.149', 'learning_rate': '0.0001', 'ppl': '2.708', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.72', 'tokens/total': 22456320, 'tokens/trainable': 2760477, 'epoch': '0.05188'}
  3%|█████▏                                                                                                                                                                                                  | 17/656 [48:44<30:23:53, 171.26s/it]  3%|█████▍                                                                                                                                                                                                  | 18/656 [51:35<30:18:53, 171.06s/it]                                                                                                                                                                                                                                                  {'loss': '0.9663', 'grad_norm': '1.213', 'learning_rate': '0.0001', 'ppl': '2.628', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '91.22', 'tokens/total': 23777280, 'tokens/trainable': 2924768, 'epoch': '0.05493'}
  3%|█████▍                                                                                                                                                                                                  | 18/656 [51:35<30:18:53, 171.06s/it]  3%|█████▊                                                                                                                                                                                                  | 19/656 [54:23<30:08:31, 170.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.9706', 'grad_norm': '1.119', 'learning_rate': '0.0001', 'ppl': '2.639', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.42', 'tokens/total': 25098240, 'tokens/trainable': 3085842, 'epoch': '0.05798'}
  3%|█████▊                                                                                                                                                                                                  | 19/656 [54:23<30:08:31, 170.35s/it]  3%|██████                                                                                                                                                                                                  | 20/656 [57:13<30:04:03, 170.19s/it]                                                                                                                                                                                                                                                  {'loss': '0.9835', 'grad_norm': '1.208', 'learning_rate': '0.0001', 'ppl': '2.674', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.87', 'tokens/total': 26419200, 'tokens/trainable': 3253275, 'epoch': '0.06103'}
  3%|██████                                                                                                                                                                                                  | 20/656 [57:13<30:04:03, 170.19s/it]  3%|██████▎                                                                                                                                                                                               | 21/656 [1:00:05<30:06:49, 170.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.9605', 'grad_norm': '1.223', 'learning_rate': '0.0001', 'ppl': '2.613', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.21', 'tokens/total': 27740160, 'tokens/trainable': 3412195, 'epoch': '0.06408'}
  3%|██████▎                                                                                                                                                                                               | 21/656 [1:00:05<30:06:49, 170.72s/it]  3%|██████▋                                                                                                                                                                                               | 22/656 [1:02:57<30:07:42, 171.08s/it]                                                                                                                                                                                                                                                  {'loss': '0.981', 'grad_norm': '1.049', 'learning_rate': '0.0001', 'ppl': '2.667', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.63', 'tokens/total': 29061120, 'tokens/trainable': 3567178, 'epoch': '0.06713'}
  3%|██████▋                                                                                                                                                                                               | 22/656 [1:02:57<30:07:42, 171.08s/it]  4%|██████▉                                                                                                                                                                                               | 23/656 [1:05:49<30:07:44, 171.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.9269', 'grad_norm': '0.9415', 'learning_rate': '0.0001', 'ppl': '2.527', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '61.31', 'tokens/total': 30382080, 'tokens/trainable': 3744140, 'epoch': '0.07019'}
  4%|██████▉                                                                                                                                                                                               | 23/656 [1:05:49<30:07:44, 171.35s/it]  4%|███████▏                                                                                                                                                                                              | 24/656 [1:08:39<30:00:32, 170.94s/it]                                                                                                                                                                                                                                                  {'loss': '0.9579', 'grad_norm': '1.062', 'learning_rate': '0.0001', 'ppl': '2.606', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.89', 'tokens/total': 31703040, 'tokens/trainable': 3892464, 'epoch': '0.07324'}
  4%|███████▏                                                                                                                                                                                              | 24/656 [1:08:39<30:00:32, 170.94s/it]  4%|███████▌                                                                                                                                                                                              | 25/656 [1:11:33<30:05:41, 171.70s/it]                                                                                                                                                                                                                                                  {'loss': '0.9275', 'grad_norm': '0.9405', 'learning_rate': '0.0001', 'ppl': '2.528', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.2', 'tokens/total': 33024000, 'tokens/trainable': 4072136, 'epoch': '0.07629'}
  4%|███████▌                                                                                                                                                                                              | 25/656 [1:11:33<30:05:41, 171.70s/it]  4%|███████▊                                                                                                                                                                                              | 26/656 [1:14:21<29:53:59, 170.86s/it]                                                                                                                                                                                                                                                  {'loss': '0.9745', 'grad_norm': '1.036', 'learning_rate': '0.0001', 'ppl': '2.65', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '35.74', 'tokens/total': 34344960, 'tokens/trainable': 4213314, 'epoch': '0.07934'}
  4%|███████▊                                                                                                                                                                                              | 26/656 [1:14:21<29:53:59, 170.86s/it]  4%|████████▏                                                                                                                                                                                             | 27/656 [1:17:13<29:52:34, 170.99s/it]                                                                                                                                                                                                                                                  {'loss': '0.9788', 'grad_norm': '1.007', 'learning_rate': '0.0001', 'ppl': '2.661', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.35', 'tokens/total': 35665920, 'tokens/trainable': 4358554, 'epoch': '0.08239'}
  4%|████████▏                                                                                                                                                                                             | 27/656 [1:17:13<29:52:34, 170.99s/it]  4%|████████▍                                                                                                                                                                                             | 28/656 [1:20:04<29:49:30, 170.97s/it]                                                                                                                                                                                                                                                  {'loss': '0.9372', 'grad_norm': '0.8892', 'learning_rate': '0.0001', 'ppl': '2.553', 'memory/max_active (GiB)': '54.97', 'memory/max_allocated (GiB)': '54.97', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.65', 'tokens/total': 36986880, 'tokens/trainable': 4536817, 'epoch': '0.08544'}
  4%|████████▍                                                                                                                                                                                             | 28/656 [1:20:04<29:49:30, 170.97s/it]  4%|████████▊                                                                                                                                                                                             | 29/656 [1:22:54<29:45:30, 170.86s/it]                                                                                                                                                                                                                                                  {'loss': '1.017', 'grad_norm': '0.9455', 'learning_rate': '0.0001', 'ppl': '2.766', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.37', 'tokens/total': 38307840, 'tokens/trainable': 4701138, 'epoch': '0.0885'}
  4%|████████▊                                                                                                                                                                                             | 29/656 [1:22:54<29:45:30, 170.86s/it]  5%|█████████                                                                                                                                                                                             | 30/656 [1:25:45<29:42:50, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.9389', 'grad_norm': '0.8845', 'learning_rate': '0.0001', 'ppl': '2.557', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '102.1', 'tokens/total': 39628800, 'tokens/trainable': 4861852, 'epoch': '0.09155'}
  5%|█████████                                                                                                                                                                                             | 30/656 [1:25:45<29:42:50, 170.88s/it]  5%|█████████▎                                                                                                                                                                                            | 31/656 [1:28:37<29:43:18, 171.20s/it]                                                                                                                                                                                                                                                  {'loss': '0.8912', 'grad_norm': '0.8822', 'learning_rate': '0.0001', 'ppl': '2.438', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '67.02', 'tokens/total': 40949760, 'tokens/trainable': 5035248, 'epoch': '0.0946'}
  5%|█████████▎                                                                                                                                                                                            | 31/656 [1:28:37<29:43:18, 171.20s/it]  5%|█████████▋                                                                                                                                                                                            | 32/656 [1:31:27<29:37:08, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.9669', 'grad_norm': '0.9228', 'learning_rate': '0.0001', 'ppl': '2.63', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '17.32', 'tokens/total': 42270720, 'tokens/trainable': 5183732, 'epoch': '0.09765'}
  5%|█████████▋                                                                                                                                                                                            | 32/656 [1:31:27<29:37:08, 170.88s/it]  5%|█████████▉                                                                                                                                                                                            | 33/656 [1:34:18<29:33:42, 170.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.9453', 'grad_norm': '0.8585', 'learning_rate': '0.0001', 'ppl': '2.574', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '57.42', 'tokens/total': 43591680, 'tokens/trainable': 5344590, 'epoch': '0.1007'}
  5%|█████████▉                                                                                                                                                                                            | 33/656 [1:34:18<29:33:42, 170.82s/it]  5%|██████████▎                                                                                                                                                                                           | 34/656 [1:37:09<29:31:17, 170.86s/it]                                                                                                                                                                                                                                                  {'loss': '0.8825', 'grad_norm': '0.8478', 'learning_rate': '0.0001', 'ppl': '2.417', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '84.16', 'tokens/total': 44912640, 'tokens/trainable': 5526509, 'epoch': '0.1038'}
  5%|██████████▎                                                                                                                                                                                           | 34/656 [1:37:09<29:31:17, 170.86s/it]  5%|██████████▌                                                                                                                                                                                           | 35/656 [1:40:00<29:28:07, 170.83s/it]                                                                                                                                                                                                                                                  {'loss': '0.918', 'grad_norm': '0.8547', 'learning_rate': '0.0001', 'ppl': '2.504', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.96', 'tokens/total': 46233600, 'tokens/trainable': 5689241, 'epoch': '0.1068'}
  5%|██████████▌                                                                                                                                                                                           | 35/656 [1:40:00<29:28:07, 170.83s/it]  5%|██████████▊                                                                                                                                                                                           | 36/656 [1:42:50<29:23:24, 170.65s/it]                                                                                                                                                                                                                                                  {'loss': '0.8952', 'grad_norm': '0.8629', 'learning_rate': '0.0001', 'ppl': '2.448', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.87', 'tokens/total': 47554560, 'tokens/trainable': 5851204, 'epoch': '0.1099'}
  5%|██████████▊                                                                                                                                                                                           | 36/656 [1:42:50<29:23:24, 170.65s/it]  6%|███████████▏                                                                                                                                                                                          | 37/656 [1:45:41<29:22:52, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.9436', 'grad_norm': '0.939', 'learning_rate': '0.0001', 'ppl': '2.569', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.31', 'tokens/total': 48875520, 'tokens/trainable': 5996815, 'epoch': '0.1129'}
  6%|███████████▏                                                                                                                                                                                          | 37/656 [1:45:41<29:22:52, 170.88s/it]  6%|███████████▍                                                                                                                                                                                          | 38/656 [1:48:32<29:18:41, 170.75s/it]                                                                                                                                                                                                                                                  {'loss': '0.9173', 'grad_norm': '0.8507', 'learning_rate': '0.0001', 'ppl': '2.503', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.7', 'tokens/total': 50196480, 'tokens/trainable': 6161541, 'epoch': '0.116'}
  6%|███████████▍                                                                                                                                                                                          | 38/656 [1:48:32<29:18:41, 170.75s/it]  6%|███████████▊                                                                                                                                                                                          | 39/656 [1:51:23<29:16:47, 170.84s/it]                                                                                                                                                                                                                                                  {'loss': '0.8948', 'grad_norm': '0.9191', 'learning_rate': '0.0001', 'ppl': '2.447', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.95', 'tokens/total': 51517440, 'tokens/trainable': 6321574, 'epoch': '0.119'}
  6%|███████████▊                                                                                                                                                                                          | 39/656 [1:51:23<29:16:47, 170.84s/it]  6%|████████████                                                                                                                                                                                          | 40/656 [1:54:15<29:16:58, 171.13s/it]                                                                                                                                                                                                                                                  {'loss': '0.8794', 'grad_norm': '0.8337', 'learning_rate': '0.0001', 'ppl': '2.409', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.94', 'tokens/total': 52838400, 'tokens/trainable': 6489392, 'epoch': '0.1221'}
  6%|████████████                                                                                                                                                                                          | 40/656 [1:54:15<29:16:58, 171.13s/it]  6%|████████████▍                                                                                                                                                                                         | 41/656 [1:57:05<29:11:33, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.8873', 'grad_norm': '0.8152', 'learning_rate': '0.0001', 'ppl': '2.429', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.68', 'tokens/total': 54159360, 'tokens/trainable': 6659837, 'epoch': '0.1251'}
  6%|████████████▍                                                                                                                                                                                         | 41/656 [1:57:05<29:11:33, 170.88s/it]  6%|████████████▋                                                                                                                                                                                         | 42/656 [1:59:55<29:05:18, 170.55s/it]                                                                                                                                                                                                                                                  {'loss': '0.8658', 'grad_norm': '0.8004', 'learning_rate': '0.0001', 'ppl': '2.377', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.69', 'tokens/total': 55480320, 'tokens/trainable': 6826668, 'epoch': '0.1282'}
  6%|████████████▋                                                                                                                                                                                         | 42/656 [1:59:55<29:05:18, 170.55s/it]  7%|████████████▉                                                                                                                                                                                         | 43/656 [2:02:46<29:04:20, 170.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.9066', 'grad_norm': '0.816', 'learning_rate': '0.0001', 'ppl': '2.476', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '80.22', 'tokens/total': 56801280, 'tokens/trainable': 6987777, 'epoch': '0.1312'}
  7%|████████████▉                                                                                                                                                                                         | 43/656 [2:02:46<29:04:20, 170.73s/it]  7%|█████████████▎                                                                                                                                                                                        | 44/656 [2:05:37<29:03:15, 170.91s/it]                                                                                                                                                                                                                                                  {'loss': '0.8953', 'grad_norm': '0.8975', 'learning_rate': '0.0001', 'ppl': '2.448', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.56', 'tokens/total': 58122240, 'tokens/trainable': 7136651, 'epoch': '0.1343'}
  7%|█████████████▎                                                                                                                                                                                        | 44/656 [2:05:37<29:03:15, 170.91s/it]  7%|█████████████▌                                                                                                                                                                                        | 45/656 [2:08:31<29:08:36, 171.71s/it]                                                                                                                                                                                                                                                  {'loss': '0.8791', 'grad_norm': '0.8881', 'learning_rate': '0.0001', 'ppl': '2.409', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '79.42', 'tokens/total': 59443200, 'tokens/trainable': 7323254, 'epoch': '0.1373'}
  7%|█████████████▌                                                                                                                                                                                        | 45/656 [2:08:31<29:08:36, 171.71s/it]  7%|█████████████▉                                                                                                                                                                                        | 46/656 [2:11:22<29:05:04, 171.65s/it]                                                                                                                                                                                                                                                  {'loss': '0.9379', 'grad_norm': '0.9149', 'learning_rate': '0.0001', 'ppl': '2.554', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '88.33', 'tokens/total': 60764160, 'tokens/trainable': 7494494, 'epoch': '0.1404'}
  7%|█████████████▉                                                                                                                                                                                        | 46/656 [2:11:22<29:05:04, 171.65s/it]  7%|██████████████▏                                                                                                                                                                                       | 47/656 [2:14:13<29:00:00, 171.43s/it]                                                                                                                                                                                                                                                  {'loss': '0.9145', 'grad_norm': '0.8509', 'learning_rate': '0.0001', 'ppl': '2.496', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '80.74', 'tokens/total': 62085120, 'tokens/trainable': 7647130, 'epoch': '0.1434'}
  7%|██████████████▏                                                                                                                                                                                       | 47/656 [2:14:13<29:00:00, 171.43s/it]  7%|██████████████▍                                                                                                                                                                                       | 48/656 [2:17:04<28:54:38, 171.18s/it]                                                                                                                                                                                                                                                  {'loss': '0.8936', 'grad_norm': '0.8986', 'learning_rate': '0.0001', 'ppl': '2.444', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.85', 'tokens/total': 63406080, 'tokens/trainable': 7801065, 'epoch': '0.1465'}
  7%|██████████████▍                                                                                                                                                                                       | 48/656 [2:17:04<28:54:38, 171.18s/it]  7%|██████████████▊                                                                                                                                                                                       | 49/656 [2:19:56<28:56:26, 171.64s/it]                                                                                                                                                                                                                                                  {'loss': '0.8865', 'grad_norm': '3.433', 'learning_rate': '0.0001', 'ppl': '2.427', 'memory/max_active (GiB)': '54.93', 'memory/max_allocated (GiB)': '54.93', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.75', 'tokens/total': 64727040, 'tokens/trainable': 7954761, 'epoch': '0.1495'}
  7%|██████████████▊                                                                                                                                                                                       | 49/656 [2:19:56<28:56:26, 171.64s/it]  8%|███████████████                                                                                                                                                                                       | 50/656 [2:22:48<28:53:30, 171.63s/it]                                                                                                                                                                                                                                                  {'loss': '0.9334', 'grad_norm': '0.8918', 'learning_rate': '0.0001', 'ppl': '2.543', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.31', 'tokens/total': 66048000, 'tokens/trainable': 8119003, 'epoch': '0.1526'}
  8%|███████████████                                                                                                                                                                                       | 50/656 [2:22:48<28:53:30, 171.63s/it]  8%|███████████████▍                                                                                                                                                                                      | 51/656 [2:25:38<28:46:33, 171.23s/it]                                                                                                                                                                                                                                                  {'loss': '0.9171', 'grad_norm': '0.8382', 'learning_rate': '0.0001', 'ppl': '2.502', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.61', 'tokens/total': 67368960, 'tokens/trainable': 8282647, 'epoch': '0.1556'}
  8%|███████████████▍                                                                                                                                                                                      | 51/656 [2:25:38<28:46:33, 171.23s/it]  8%|███████████████▋                                                                                                                                                                                      | 52/656 [2:28:28<28:39:22, 170.80s/it]                                                                                                                                                                                                                                                  {'loss': '0.8981', 'grad_norm': '0.8381', 'learning_rate': '0.0001', 'ppl': '2.455', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.09', 'tokens/total': 68689920, 'tokens/trainable': 8426014, 'epoch': '0.1587'}
  8%|███████████████▋                                                                                                                                                                                      | 52/656 [2:28:28<28:39:22, 170.80s/it]  8%|███████████████▉                                                                                                                                                                                      | 53/656 [2:31:19<28:36:19, 170.78s/it]                                                                                                                                                                                                                                                  {'loss': '0.9454', 'grad_norm': '0.9364', 'learning_rate': '0.0001', 'ppl': '2.574', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.38', 'tokens/total': 70010880, 'tokens/trainable': 8561211, 'epoch': '0.1617'}
  8%|███████████████▉                                                                                                                                                                                      | 53/656 [2:31:19<28:36:19, 170.78s/it]  8%|████████████████▎                                                                                                                                                                                     | 54/656 [2:34:10<28:35:38, 170.99s/it]                                                                                                                                                                                                                                                  {'loss': '0.8845', 'grad_norm': '0.8288', 'learning_rate': '0.0001', 'ppl': '2.422', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '27.75', 'tokens/total': 71331840, 'tokens/trainable': 8713468, 'epoch': '0.1648'}
  8%|████████████████▎                                                                                                                                                                                     | 54/656 [2:34:10<28:35:38, 170.99s/it]  8%|████████████████▌                                                                                                                                                                                     | 55/656 [2:37:00<28:29:30, 170.67s/it]                                                                                                                                                                                                                                                  {'loss': '0.8758', 'grad_norm': '0.8237', 'learning_rate': '0.0001', 'ppl': '2.401', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '60.11', 'tokens/total': 72652800, 'tokens/trainable': 8871620, 'epoch': '0.1678'}
  8%|████████████████▌                                                                                                                                                                                     | 55/656 [2:37:00<28:29:30, 170.67s/it]  9%|████████████████▉                                                                                                                                                                                     | 56/656 [2:39:50<28:22:18, 170.23s/it]                                                                                                                                                                                                                                                  {'loss': '0.8782', 'grad_norm': '0.802', 'learning_rate': '0.0001', 'ppl': '2.407', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '83.41', 'tokens/total': 73973760, 'tokens/trainable': 9030771, 'epoch': '0.1709'}
  9%|████████████████▉                                                                                                                                                                                     | 56/656 [2:39:50<28:22:18, 170.23s/it]  9%|█████████████████▏                                                                                                                                                                                    | 57/656 [2:42:38<28:15:33, 169.84s/it]                                                                                                                                                                                                                                                  {'loss': '0.8981', 'grad_norm': '0.8389', 'learning_rate': '0.0001', 'ppl': '2.455', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.89', 'tokens/total': 75294720, 'tokens/trainable': 9188236, 'epoch': '0.1739'}
  9%|█████████████████▏                                                                                                                                                                                    | 57/656 [2:42:38<28:15:33, 169.84s/it]  9%|█████████████████▌                                                                                                                                                                                    | 58/656 [2:45:30<28:18:25, 170.41s/it]                                                                                                                                                                                                                                                  {'loss': '0.8973', 'grad_norm': '0.8227', 'learning_rate': '0.0001', 'ppl': '2.453', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.63', 'tokens/total': 76615680, 'tokens/trainable': 9347338, 'epoch': '0.177'}
  9%|█████████████████▌                                                                                                                                                                                    | 58/656 [2:45:30<28:18:25, 170.41s/it]  9%|█████████████████▊                                                                                                                                                                                    | 59/656 [2:48:22<28:20:09, 170.87s/it]                                                                                                                                                                                                                                                  {'loss': '0.9365', 'grad_norm': '0.8344', 'learning_rate': '0.0001', 'ppl': '2.551', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.61', 'tokens/total': 77936640, 'tokens/trainable': 9497784, 'epoch': '0.18'}
  9%|█████████████████▊                                                                                                                                                                                    | 59/656 [2:48:22<28:20:09, 170.87s/it]  9%|██████████████████                                                                                                                                                                                    | 60/656 [2:51:14<28:20:46, 171.22s/it]                                                                                                                                                                                                                                                  {'loss': '0.8966', 'grad_norm': '0.7768', 'learning_rate': '0.0001', 'ppl': '2.451', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.78', 'tokens/total': 79257600, 'tokens/trainable': 9668089, 'epoch': '0.1831'}
  9%|██████████████████                                                                                                                                                                                    | 60/656 [2:51:14<28:20:46, 171.22s/it]  9%|██████████████████▍                                                                                                                                                                                   | 61/656 [2:54:04<28:13:06, 170.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.9366', 'grad_norm': '0.8203', 'learning_rate': '0.0001', 'ppl': '2.551', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.26', 'tokens/total': 80578560, 'tokens/trainable': 9823884, 'epoch': '0.1861'}
  9%|██████████████████▍                                                                                                                                                                                   | 61/656 [2:54:04<28:13:06, 170.73s/it]  9%|██████████████████▋                                                                                                                                                                                   | 62/656 [2:56:54<28:07:35, 170.46s/it]                                                                                                                                                                                                                                                  {'loss': '0.9094', 'grad_norm': '0.7661', 'learning_rate': '0.0001', 'ppl': '2.483', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '61.99', 'tokens/total': 81899520, 'tokens/trainable': 10007094, 'epoch': '0.1892'}
  9%|██████████████████▋                                                                                                                                                                                   | 62/656 [2:56:54<28:07:35, 170.46s/it] 10%|███████████████████                                                                                                                                                                                   | 63/656 [2:59:47<28:12:56, 171.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.9166', 'grad_norm': '0.7874', 'learning_rate': '0.0001', 'ppl': '2.501', 'memory/max_active (GiB)': '54.98', 'memory/max_allocated (GiB)': '54.98', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.45', 'tokens/total': 83220480, 'tokens/trainable': 10172171, 'epoch': '0.1922'}
 10%|███████████████████                                                                                                                                                                                   | 63/656 [2:59:47<28:12:56, 171.29s/it] 10%|███████████████████▎                                                                                                                                                                                  | 64/656 [3:02:37<28:07:21, 171.02s/it]                                                                                                                                                                                                                                                  {'loss': '0.8639', 'grad_norm': '0.7861', 'learning_rate': '0.0001', 'ppl': '2.372', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.58', 'tokens/total': 84541440, 'tokens/trainable': 10341442, 'epoch': '0.1953'}
 10%|███████████████████▎                                                                                                                                                                                  | 64/656 [3:02:37<28:07:21, 171.02s/it] 10%|███████████████████▌                                                                                                                                                                                  | 65/656 [3:05:27<27:59:58, 170.56s/it]                                                                                                                                                                                                                                                  {'loss': '0.8918', 'grad_norm': '0.7767', 'learning_rate': '0.0001', 'ppl': '2.439', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.22', 'tokens/total': 85862400, 'tokens/trainable': 10494868, 'epoch': '0.1984'}
 10%|███████████████████▌                                                                                                                                                                                  | 65/656 [3:05:27<27:59:58, 170.56s/it] 10%|███████████████████▉                                                                                                                                                                                  | 66/656 [3:08:14<27:48:55, 169.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.8809', 'grad_norm': '0.8798', 'learning_rate': '0.0001', 'ppl': '2.413', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.81', 'tokens/total': 87183360, 'tokens/trainable': 10628907, 'epoch': '0.2014'}
 10%|███████████████████▉                                                                                                                                                                                  | 66/656 [3:08:14<27:48:55, 169.72s/it] 10%|████████████████████▏                                                                                                                                                                                 | 67/656 [3:11:06<27:50:39, 170.19s/it]                                                                                                                                                                                                                                                  {'loss': '0.8368', 'grad_norm': '0.8187', 'learning_rate': '0.0001', 'ppl': '2.309', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.11', 'tokens/total': 88504320, 'tokens/trainable': 10799712, 'epoch': '0.2045'}
 10%|████████████████████▏                                                                                                                                                                                 | 67/656 [3:11:06<27:50:39, 170.19s/it] 10%|████████████████████▌                                                                                                                                                                                 | 68/656 [3:13:54<27:42:23, 169.63s/it]                                                                                                                                                                                                                                                  {'loss': '0.8768', 'grad_norm': '0.8515', 'learning_rate': '0.0001', 'ppl': '2.403', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.95', 'tokens/total': 89825280, 'tokens/trainable': 10951671, 'epoch': '0.2075'}
 10%|████████████████████▌                                                                                                                                                                                 | 68/656 [3:13:54<27:42:23, 169.63s/it] 11%|████████████████████▊                                                                                                                                                                                 | 69/656 [3:16:45<27:42:28, 169.93s/it]                                                                                                                                                                                                                                                  {'loss': '0.9084', 'grad_norm': '0.8051', 'learning_rate': '0.0001', 'ppl': '2.48', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '76.09', 'tokens/total': 91146240, 'tokens/trainable': 11102524, 'epoch': '0.2106'}
 11%|████████████████████▊                                                                                                                                                                                 | 69/656 [3:16:45<27:42:28, 169.93s/it] 11%|█████████████████████▏                                                                                                                                                                                | 70/656 [3:19:36<27:44:58, 170.48s/it]                                                                                                                                                                                                                                                  {'loss': '0.867', 'grad_norm': '0.8534', 'learning_rate': '0.0001', 'ppl': '2.38', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '60.02', 'tokens/total': 92467200, 'tokens/trainable': 11249443, 'epoch': '0.2136'}
 11%|█████████████████████▏                                                                                                                                                                                | 70/656 [3:19:36<27:44:58, 170.48s/it] 11%|█████████████████████▍                                                                                                                                                                                | 71/656 [3:22:28<27:44:13, 170.69s/it]                                                                                                                                                                                                                                                  {'loss': '0.8844', 'grad_norm': '0.7819', 'learning_rate': '0.0001', 'ppl': '2.421', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '70.93', 'tokens/total': 93788160, 'tokens/trainable': 11415696, 'epoch': '0.2167'}
 11%|█████████████████████▍                                                                                                                                                                                | 71/656 [3:22:28<27:44:13, 170.69s/it] 11%|█████████████████████▋                                                                                                                                                                                | 72/656 [3:25:18<27:41:31, 170.71s/it]                                                                                                                                                                                                                                                  {'loss': '0.8656', 'grad_norm': '0.8254', 'learning_rate': '0.0001', 'ppl': '2.376', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '30.91', 'tokens/total': 95109120, 'tokens/trainable': 11574940, 'epoch': '0.2197'}
 11%|█████████████████████▋                                                                                                                                                                                | 72/656 [3:25:18<27:41:31, 170.71s/it] 11%|██████████████████████                                                                                                                                                                                | 73/656 [3:28:10<27:42:08, 171.06s/it]                                                                                                                                                                                                                                                  {'loss': '0.8649', 'grad_norm': '0.7573', 'learning_rate': '0.0001', 'ppl': '2.375', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '76.06', 'tokens/total': 96430080, 'tokens/trainable': 11738498, 'epoch': '0.2228'}
 11%|██████████████████████                                                                                                                                                                                | 73/656 [3:28:10<27:42:08, 171.06s/it] 11%|██████████████████████▎                                                                                                                                                                               | 74/656 [3:31:00<27:36:01, 170.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.9141', 'grad_norm': '0.8495', 'learning_rate': '0.0001', 'ppl': '2.495', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '63.76', 'tokens/total': 97751040, 'tokens/trainable': 11893341, 'epoch': '0.2258'}
 11%|██████████████████████▎                                                                                                                                                                               | 74/656 [3:31:00<27:36:01, 170.72s/it] 11%|██████████████████████▋                                                                                                                                                                               | 75/656 [3:33:50<27:31:46, 170.58s/it]                                                                                                                                                                                                                                                  {'loss': '0.8698', 'grad_norm': '0.8239', 'learning_rate': '0.0001', 'ppl': '2.387', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '64.66', 'tokens/total': 99072000, 'tokens/trainable': 12041845, 'epoch': '0.2289'}
 11%|██████████████████████▋                                                                                                                                                                               | 75/656 [3:33:50<27:31:46, 170.58s/it] 12%|██████████████████████▉                                                                                                                                                                               | 76/656 [3:36:41<27:29:07, 170.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.8339', 'grad_norm': '0.7571', 'learning_rate': '0.0001', 'ppl': '2.302', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44', 'tokens/total': 100392960, 'tokens/trainable': 12223131, 'epoch': '0.2319'}
 12%|██████████████████████▉                                                                                                                                                                               | 76/656 [3:36:41<27:29:07, 170.60s/it] 12%|███████████████████████▏                                                                                                                                                                              | 77/656 [3:39:31<27:25:23, 170.51s/it]                                                                                                                                                                                                                                                  {'loss': '0.9047', 'grad_norm': '0.8505', 'learning_rate': '0.0001', 'ppl': '2.471', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.49', 'tokens/total': 101713920, 'tokens/trainable': 12370717, 'epoch': '0.235'}
 12%|███████████████████████▏                                                                                                                                                                              | 77/656 [3:39:31<27:25:23, 170.51s/it] 12%|███████████████████████▌                                                                                                                                                                              | 78/656 [3:42:22<27:22:03, 170.46s/it]                                                                                                                                                                                                                                                  {'loss': '0.8823', 'grad_norm': '0.7988', 'learning_rate': '0.0001', 'ppl': '2.416', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.76', 'tokens/total': 103034880, 'tokens/trainable': 12515477, 'epoch': '0.238'}
 12%|███████████████████████▌                                                                                                                                                                              | 78/656 [3:42:22<27:22:03, 170.46s/it] 12%|███████████████████████▊                                                                                                                                                                              | 79/656 [3:45:13<27:22:30, 170.80s/it]                                                                                                                                                                                                                                                  {'loss': '0.8695', 'grad_norm': '0.8316', 'learning_rate': '0.0001', 'ppl': '2.386', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.03', 'tokens/total': 104355840, 'tokens/trainable': 12672146, 'epoch': '0.2411'}
 12%|███████████████████████▊                                                                                                                                                                              | 79/656 [3:45:13<27:22:30, 170.80s/it] 12%|████████████████████████▏                                                                                                                                                                             | 80/656 [3:48:04<27:19:01, 170.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.8099', 'grad_norm': '0.7153', 'learning_rate': '0.0001', 'ppl': '2.248', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '87.22', 'tokens/total': 105676800, 'tokens/trainable': 12848187, 'epoch': '0.2441'}
 12%|████████████████████████▏                                                                                                                                                                             | 80/656 [3:48:04<27:19:01, 170.73s/it] 12%|████████████████████████▍                                                                                                                                                                             | 81/656 [3:50:54<27:15:24, 170.65s/it]                                                                                                                                                                                                                                                  {'loss': '0.8544', 'grad_norm': '0.8062', 'learning_rate': '0.0001', 'ppl': '2.35', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.38', 'tokens/total': 106997760, 'tokens/trainable': 12996317, 'epoch': '0.2472'}
 12%|████████████████████████▍                                                                                                                                                                             | 81/656 [3:50:54<27:15:24, 170.65s/it] 12%|████████████████████████▊                                                                                                                                                                             | 82/656 [3:53:44<27:09:23, 170.32s/it]                                                                                                                                                                                                                                                  {'loss': '0.871', 'grad_norm': '0.7938', 'learning_rate': '0.0001', 'ppl': '2.389', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.63', 'tokens/total': 108318720, 'tokens/trainable': 13153337, 'epoch': '0.2502'}
 12%|████████████████████████▊                                                                                                                                                                             | 82/656 [3:53:44<27:09:23, 170.32s/it] 13%|█████████████████████████                                                                                                                                                                             | 83/656 [3:56:36<27:10:32, 170.74s/it]                                                                                                                                                                                                                                                  {'loss': '0.8476', 'grad_norm': '0.745', 'learning_rate': '0.0001', 'ppl': '2.334', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.78', 'tokens/total': 109639680, 'tokens/trainable': 13318540, 'epoch': '0.2533'}
 13%|█████████████████████████                                                                                                                                                                             | 83/656 [3:56:36<27:10:32, 170.74s/it] 13%|█████████████████████████▎                                                                                                                                                                            | 84/656 [3:59:28<27:12:17, 171.22s/it]                                                                                                                                                                                                                                                  {'loss': '0.873', 'grad_norm': '0.7387', 'learning_rate': '0.0001', 'ppl': '2.394', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.54', 'tokens/total': 110960640, 'tokens/trainable': 13493329, 'epoch': '0.2563'}
 13%|█████████████████████████▎                                                                                                                                                                            | 84/656 [3:59:28<27:12:17, 171.22s/it] 13%|█████████████████████████▋                                                                                                                                                                            | 85/656 [4:02:17<27:03:28, 170.59s/it]                                                                                                                                                                                                                                                  {'loss': '0.8793', 'grad_norm': '0.7587', 'learning_rate': '0.0001', 'ppl': '2.409', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '29.7', 'tokens/total': 112281600, 'tokens/trainable': 13650671, 'epoch': '0.2594'}
 13%|█████████████████████████▋                                                                                                                                                                            | 85/656 [4:02:17<27:03:28, 170.59s/it] 13%|█████████████████████████▉                                                                                                                                                                            | 86/656 [4:05:08<27:00:24, 170.57s/it]                                                                                                                                                                                                                                                  {'loss': '0.9514', 'grad_norm': '0.7783', 'learning_rate': '0.0001', 'ppl': '2.589', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '61.75', 'tokens/total': 113602560, 'tokens/trainable': 13815620, 'epoch': '0.2624'}
 13%|█████████████████████████▉                                                                                                                                                                            | 86/656 [4:05:08<27:00:24, 170.57s/it] 13%|██████████████████████████▎                                                                                                                                                                           | 87/656 [4:07:59<27:00:33, 170.89s/it]                                                                                                                                                                                                                                                  {'loss': '0.8453', 'grad_norm': '0.7085', 'learning_rate': '0.0001', 'ppl': '2.329', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.1', 'tokens/total': 114923520, 'tokens/trainable': 14001464, 'epoch': '0.2655'}
 13%|██████████████████████████▎                                                                                                                                                                           | 87/656 [4:07:59<27:00:33, 170.89s/it] 13%|██████████████████████████▌                                                                                                                                                                           | 88/656 [4:10:49<26:55:16, 170.63s/it]                                                                                                                                                                                                                                                  {'loss': '0.9103', 'grad_norm': '0.8017', 'learning_rate': '0.0001', 'ppl': '2.485', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '75.92', 'tokens/total': 116244480, 'tokens/trainable': 14150011, 'epoch': '0.2685'}
 13%|██████████████████████████▌                                                                                                                                                                           | 88/656 [4:10:49<26:55:16, 170.63s/it] 14%|██████████████████████████▊                                                                                                                                                                           | 89/656 [4:13:40<26:52:05, 170.59s/it]                                                                                                                                                                                                                                                  {'loss': '0.8894', 'grad_norm': '0.715', 'learning_rate': '0.0001', 'ppl': '2.434', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.68', 'tokens/total': 117565440, 'tokens/trainable': 14312944, 'epoch': '0.2716'}
 14%|██████████████████████████▊                                                                                                                                                                           | 89/656 [4:13:40<26:52:05, 170.59s/it] 14%|███████████████████████████▏                                                                                                                                                                          | 90/656 [4:16:30<26:47:08, 170.37s/it]                                                                                                                                                                                                                                                  {'loss': '0.8916', 'grad_norm': '0.7911', 'learning_rate': '0.0001', 'ppl': '2.439', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.61', 'tokens/total': 118886400, 'tokens/trainable': 14462956, 'epoch': '0.2746'}
 14%|███████████████████████████▏                                                                                                                                                                          | 90/656 [4:16:30<26:47:08, 170.37s/it] 14%|███████████████████████████▍                                                                                                                                                                          | 91/656 [4:19:20<26:44:30, 170.39s/it]                                                                                                                                                                                                                                                  {'loss': '0.8405', 'grad_norm': '0.8099', 'learning_rate': '0.0001', 'ppl': '2.317', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.44', 'tokens/total': 120207360, 'tokens/trainable': 14621211, 'epoch': '0.2777'}
 14%|███████████████████████████▍                                                                                                                                                                          | 91/656 [4:19:20<26:44:30, 170.39s/it] 14%|███████████████████████████▊                                                                                                                                                                          | 92/656 [4:22:10<26:41:20, 170.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.8237', 'grad_norm': '0.7495', 'learning_rate': '0.0001', 'ppl': '2.279', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '35.91', 'tokens/total': 121528320, 'tokens/trainable': 14787950, 'epoch': '0.2807'}
 14%|███████████████████████████▊                                                                                                                                                                          | 92/656 [4:22:10<26:41:20, 170.35s/it] 14%|████████████████████████████                                                                                                                                                                          | 93/656 [4:25:02<26:42:26, 170.77s/it]                                                                                                                                                                                                                                                  {'loss': '0.8995', 'grad_norm': '0.787', 'learning_rate': '0.0001', 'ppl': '2.458', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '61.64', 'tokens/total': 122849280, 'tokens/trainable': 14936103, 'epoch': '0.2838'}
 14%|████████████████████████████                                                                                                                                                                          | 93/656 [4:25:02<26:42:26, 170.77s/it] 14%|████████████████████████████▎                                                                                                                                                                         | 94/656 [4:27:51<26:35:10, 170.30s/it]                                                                                                                                                                                                                                                  {'loss': '0.8676', 'grad_norm': '0.7796', 'learning_rate': '0.0001', 'ppl': '2.381', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.18', 'tokens/total': 124170240, 'tokens/trainable': 15085216, 'epoch': '0.2868'}
 14%|████████████████████████████▎                                                                                                                                                                         | 94/656 [4:27:51<26:35:10, 170.30s/it] 14%|████████████████████████████▋                                                                                                                                                                         | 95/656 [4:30:42<26:33:34, 170.44s/it]                                                                                                                                                                                                                                                  {'loss': '0.8637', 'grad_norm': '0.7605', 'learning_rate': '0.0001', 'ppl': '2.372', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '71', 'tokens/total': 125491200, 'tokens/trainable': 15237035, 'epoch': '0.2899'}
 14%|████████████████████████████▋                                                                                                                                                                         | 95/656 [4:30:42<26:33:34, 170.44s/it] 15%|████████████████████████████▉                                                                                                                                                                         | 96/656 [4:33:34<26:34:58, 170.89s/it]                                                                                                                                                                                                                                                  {'loss': '0.8973', 'grad_norm': '0.7527', 'learning_rate': '0.0001', 'ppl': '2.453', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.98', 'tokens/total': 126812160, 'tokens/trainable': 15403077, 'epoch': '0.293'}
 15%|████████████████████████████▉                                                                                                                                                                         | 96/656 [4:33:34<26:34:58, 170.89s/it] 15%|█████████████████████████████▎                                                                                                                                                                        | 97/656 [4:36:22<26:24:26, 170.06s/it]                                                                                                                                                                                                                                                  {'loss': '0.8625', 'grad_norm': '0.7931', 'learning_rate': '0.0001', 'ppl': '2.369', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.55', 'tokens/total': 128133120, 'tokens/trainable': 15546927, 'epoch': '0.296'}
 15%|█████████████████████████████▎                                                                                                                                                                        | 97/656 [4:36:22<26:24:26, 170.06s/it] 15%|█████████████████████████████▌                                                                                                                                                                        | 98/656 [4:39:12<26:19:56, 169.89s/it]                                                                                                                                                                                                                                                  {'loss': '0.8661', 'grad_norm': '0.8101', 'learning_rate': '0.0001', 'ppl': '2.378', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '23.14', 'tokens/total': 129454080, 'tokens/trainable': 15684794, 'epoch': '0.2991'}
 15%|█████████████████████████████▌                                                                                                                                                                        | 98/656 [4:39:12<26:19:56, 169.89s/it] 15%|█████████████████████████████▉                                                                                                                                                                        | 99/656 [4:42:03<26:20:12, 170.22s/it]                                                                                                                                                                                                                                                  {'loss': '0.8671', 'grad_norm': '0.7717', 'learning_rate': '0.0001', 'ppl': '2.38', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.1', 'tokens/total': 130775040, 'tokens/trainable': 15842270, 'epoch': '0.3021'}
 15%|█████████████████████████████▉                                                                                                                                                                        | 99/656 [4:42:03<26:20:12, 170.22s/it] 15%|██████████████████████████████                                                                                                                                                                       | 100/656 [4:44:54<26:21:13, 170.64s/it]                                                                                                                                                                                                                                                  {'loss': '0.8827', 'grad_norm': '0.7209', 'learning_rate': '0.0001', 'ppl': '2.417', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '65.56', 'tokens/total': 132096000, 'tokens/trainable': 16016428, 'epoch': '0.3052'}
 15%|██████████████████████████████                                                                                                                                                                       | 100/656 [4:44:54<26:21:13, 170.64s/it] 15%|██████████████████████████████▎                                                                                                                                                                      | 101/656 [4:47:46<26:21:19, 170.95s/it]                                                                                                                                                                                                                                                  {'loss': '0.8403', 'grad_norm': '0.6891', 'learning_rate': '0.0001', 'ppl': '2.317', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.09', 'tokens/total': 133416960, 'tokens/trainable': 16195130, 'epoch': '0.3082'}
 15%|██████████████████████████████▎                                                                                                                                                                      | 101/656 [4:47:46<26:21:19, 170.95s/it] 16%|██████████████████████████████▋                                                                                                                                                                      | 102/656 [4:50:37<26:18:29, 170.96s/it]                                                                                                                                                                                                                                                  {'loss': '0.8643', 'grad_norm': '0.7752', 'learning_rate': '0.0001', 'ppl': '2.373', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.44', 'tokens/total': 134737920, 'tokens/trainable': 16341188, 'epoch': '0.3113'}
 16%|██████████████████████████████▋                                                                                                                                                                      | 102/656 [4:50:37<26:18:29, 170.96s/it] 16%|██████████████████████████████▉                                                                                                                                                                      | 103/656 [4:53:28<26:16:23, 171.04s/it]                                                                                                                                                                                                                                                  {'loss': '0.8479', 'grad_norm': '0.7337', 'learning_rate': '0.0001', 'ppl': '2.335', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.06', 'tokens/total': 136058880, 'tokens/trainable': 16518649, 'epoch': '0.3143'}
 16%|██████████████████████████████▉                                                                                                                                                                      | 103/656 [4:53:28<26:16:23, 171.04s/it] 16%|███████████████████████████████▏                                                                                                                                                                     | 104/656 [4:56:17<26:08:27, 170.48s/it]                                                                                                                                                                                                                                                  {'loss': '0.8143', 'grad_norm': '0.7454', 'learning_rate': '0.0001', 'ppl': '2.258', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '69.83', 'tokens/total': 137379840, 'tokens/trainable': 16681305, 'epoch': '0.3174'}
 16%|███████████████████████████████▏                                                                                                                                                                     | 104/656 [4:56:17<26:08:27, 170.48s/it] 16%|███████████████████████████████▌                                                                                                                                                                     | 105/656 [4:59:09<26:08:23, 170.79s/it]                                                                                                                                                                                                                                                  {'loss': '0.8222', 'grad_norm': '0.7299', 'learning_rate': '0.0001', 'ppl': '2.276', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.5', 'tokens/total': 138700800, 'tokens/trainable': 16851516, 'epoch': '0.3204'}
 16%|███████████████████████████████▌                                                                                                                                                                     | 105/656 [4:59:09<26:08:23, 170.79s/it] 16%|███████████████████████████████▊                                                                                                                                                                     | 106/656 [5:02:01<26:08:53, 171.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.8119', 'grad_norm': '0.6862', 'learning_rate': '0.0001', 'ppl': '2.252', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '80.81', 'tokens/total': 140021760, 'tokens/trainable': 17031334, 'epoch': '0.3235'}
 16%|███████████████████████████████▊                                                                                                                                                                     | 106/656 [5:02:01<26:08:53, 171.15s/it] 16%|████████████████████████████████▏                                                                                                                                                                    | 107/656 [5:04:51<26:02:31, 170.77s/it]                                                                                                                                                                                                                                                  {'loss': '0.862', 'grad_norm': '0.7865', 'learning_rate': '0.0001', 'ppl': '2.368', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '80.61', 'tokens/total': 141342720, 'tokens/trainable': 17185768, 'epoch': '0.3265'}
 16%|████████████████████████████████▏                                                                                                                                                                    | 107/656 [5:04:51<26:02:31, 170.77s/it] 16%|████████████████████████████████▍                                                                                                                                                                    | 108/656 [5:07:42<26:00:47, 170.89s/it]                                                                                                                                                                                                                                                  {'loss': '0.8688', 'grad_norm': '0.7269', 'learning_rate': '0.0001', 'ppl': '2.384', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.14', 'tokens/total': 142663680, 'tokens/trainable': 17350094, 'epoch': '0.3296'}
 16%|████████████████████████████████▍                                                                                                                                                                    | 108/656 [5:07:42<26:00:47, 170.89s/it] 17%|████████████████████████████████▋                                                                                                                                                                    | 109/656 [5:10:34<26:00:35, 171.18s/it]                                                                                                                                                                                                                                                  {'loss': '0.8202', 'grad_norm': '0.7069', 'learning_rate': '0.0001', 'ppl': '2.271', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '35.93', 'tokens/total': 143984640, 'tokens/trainable': 17524944, 'epoch': '0.3326'}
 17%|████████████████████████████████▋                                                                                                                                                                    | 109/656 [5:10:34<26:00:35, 171.18s/it] 17%|█████████████████████████████████                                                                                                                                                                    | 110/656 [5:13:22<25:50:59, 170.44s/it]                                                                                                                                                                                                                                                  {'loss': '0.8518', 'grad_norm': '0.7495', 'learning_rate': '0.0001', 'ppl': '2.344', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.34', 'tokens/total': 145305600, 'tokens/trainable': 17689756, 'epoch': '0.3357'}
 17%|█████████████████████████████████                                                                                                                                                                    | 110/656 [5:13:22<25:50:59, 170.44s/it] 17%|█████████████████████████████████▎                                                                                                                                                                   | 111/656 [5:16:12<25:45:49, 170.18s/it]                                                                                                                                                                                                                                                  {'loss': '0.8382', 'grad_norm': '0.7678', 'learning_rate': '0.0001', 'ppl': '2.312', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.23', 'tokens/total': 146626560, 'tokens/trainable': 17833360, 'epoch': '0.3387'}
 17%|█████████████████████████████████▎                                                                                                                                                                   | 111/656 [5:16:12<25:45:49, 170.18s/it] 17%|█████████████████████████████████▋                                                                                                                                                                   | 112/656 [5:19:03<25:45:30, 170.46s/it]                                                                                                                                                                                                                                                  {'loss': '0.8604', 'grad_norm': '0.7491', 'learning_rate': '0.0001', 'ppl': '2.364', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.6', 'tokens/total': 147947520, 'tokens/trainable': 17996872, 'epoch': '0.3418'}
 17%|█████████████████████████████████▋                                                                                                                                                                   | 112/656 [5:19:03<25:45:30, 170.46s/it] 17%|█████████████████████████████████▉                                                                                                                                                                   | 113/656 [5:21:55<25:46:38, 170.90s/it]                                                                                                                                                                                                                                                  {'loss': '0.8237', 'grad_norm': '0.7197', 'learning_rate': '0.0001', 'ppl': '2.279', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.93', 'tokens/total': 149268480, 'tokens/trainable': 18160752, 'epoch': '0.3448'}
 17%|█████████████████████████████████▉                                                                                                                                                                   | 113/656 [5:21:55<25:46:38, 170.90s/it] 17%|██████████████████████████████████▏                                                                                                                                                                  | 114/656 [5:24:46<25:43:07, 170.83s/it]                                                                                                                                                                                                                                                  {'loss': '0.8788', 'grad_norm': '0.7852', 'learning_rate': '0.0001', 'ppl': '2.408', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '27.35', 'tokens/total': 150589440, 'tokens/trainable': 18317564, 'epoch': '0.3479'}
 17%|██████████████████████████████████▏                                                                                                                                                                  | 114/656 [5:24:46<25:43:07, 170.83s/it] 18%|██████████████████████████████████▌                                                                                                                                                                  | 115/656 [5:27:37<25:40:44, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.8991', 'grad_norm': '0.7186', 'learning_rate': '0.0001', 'ppl': '2.457', 'memory/max_active (GiB)': '54.96', 'memory/max_allocated (GiB)': '54.96', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '67.67', 'tokens/total': 151910400, 'tokens/trainable': 18494308, 'epoch': '0.3509'}
 18%|██████████████████████████████████▌                                                                                                                                                                  | 115/656 [5:27:37<25:40:44, 170.88s/it] 18%|██████████████████████████████████▊                                                                                                                                                                  | 116/656 [5:30:28<25:39:39, 171.07s/it]                                                                                                                                                                                                                                                  {'loss': '0.8598', 'grad_norm': '0.7497', 'learning_rate': '0.0001', 'ppl': '2.363', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.68', 'tokens/total': 153231360, 'tokens/trainable': 18652172, 'epoch': '0.354'}
 18%|██████████████████████████████████▊                                                                                                                                                                  | 116/656 [5:30:28<25:39:39, 171.07s/it] 18%|███████████████████████████████████▏                                                                                                                                                                 | 117/656 [5:33:18<25:34:19, 170.80s/it]                                                                                                                                                                                                                                                  {'loss': '0.8334', 'grad_norm': '0.7379', 'learning_rate': '0.0001', 'ppl': '2.301', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '35.49', 'tokens/total': 154552320, 'tokens/trainable': 18805200, 'epoch': '0.357'}
 18%|███████████████████████████████████▏                                                                                                                                                                 | 117/656 [5:33:18<25:34:19, 170.80s/it] 18%|███████████████████████████████████▍                                                                                                                                                                 | 118/656 [5:36:09<25:30:29, 170.69s/it]                                                                                                                                                                                                                                                  {'loss': '0.8378', 'grad_norm': '0.7287', 'learning_rate': '0.0001', 'ppl': '2.311', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.76', 'tokens/total': 155873280, 'tokens/trainable': 18970060, 'epoch': '0.3601'}
 18%|███████████████████████████████████▍                                                                                                                                                                 | 118/656 [5:36:09<25:30:29, 170.69s/it] 18%|███████████████████████████████████▋                                                                                                                                                                 | 119/656 [5:38:58<25:24:55, 170.38s/it]                                                                                                                                                                                                                                                  {'loss': '0.8282', 'grad_norm': '0.7764', 'learning_rate': '0.0001', 'ppl': '2.289', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '49.38', 'tokens/total': 157194240, 'tokens/trainable': 19127554, 'epoch': '0.3631'}
 18%|███████████████████████████████████▋                                                                                                                                                                 | 119/656 [5:38:58<25:24:55, 170.38s/it] 18%|████████████████████████████████████                                                                                                                                                                 | 120/656 [5:41:49<25:23:36, 170.55s/it]                                                                                                                                                                                                                                                  {'loss': '0.8443', 'grad_norm': '0.7376', 'learning_rate': '0.0001', 'ppl': '2.326', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.27', 'tokens/total': 158515200, 'tokens/trainable': 19286840, 'epoch': '0.3662'}
 18%|████████████████████████████████████                                                                                                                                                                 | 120/656 [5:41:50<25:23:36, 170.55s/it] 18%|████████████████████████████████████▎                                                                                                                                                                | 121/656 [5:44:39<25:17:55, 170.23s/it]                                                                                                                                                                                                                                                  {'loss': '0.8986', 'grad_norm': '0.8092', 'learning_rate': '0.0001', 'ppl': '2.456', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.27', 'tokens/total': 159836160, 'tokens/trainable': 19428396, 'epoch': '0.3692'}
 18%|████████████████████████████████████▎                                                                                                                                                                | 121/656 [5:44:39<25:17:55, 170.23s/it] 19%|████████████████████████████████████▋                                                                                                                                                                | 122/656 [5:47:29<25:16:09, 170.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.8241', 'grad_norm': '0.712', 'learning_rate': '0.0001', 'ppl': '2.28', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.15', 'tokens/total': 161157120, 'tokens/trainable': 19594608, 'epoch': '0.3723'}
 19%|████████████████████████████████████▋                                                                                                                                                                | 122/656 [5:47:29<25:16:09, 170.35s/it] 19%|████████████████████████████████████▉                                                                                                                                                                | 123/656 [5:50:19<25:10:11, 170.00s/it]                                                                                                                                                                                                                                                  {'loss': '0.86', 'grad_norm': '0.7473', 'learning_rate': '0.0001', 'ppl': '2.363', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '64.68', 'tokens/total': 162478080, 'tokens/trainable': 19754214, 'epoch': '0.3753'}
 19%|████████████████████████████████████▉                                                                                                                                                                | 123/656 [5:50:19<25:10:11, 170.00s/it] 19%|█████████████████████████████████████▏                                                                                                                                                               | 124/656 [5:53:10<25:10:40, 170.38s/it]                                                                                                                                                                                                                                                  {'loss': '0.8688', 'grad_norm': '0.7497', 'learning_rate': '0.0001', 'ppl': '2.384', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.13', 'tokens/total': 163799040, 'tokens/trainable': 19908898, 'epoch': '0.3784'}
 19%|█████████████████████████████████████▏                                                                                                                                                               | 124/656 [5:53:10<25:10:40, 170.38s/it] 19%|█████████████████████████████████████▌                                                                                                                                                               | 125/656 [5:56:02<25:11:10, 170.75s/it]                                                                                                                                                                                                                                                  {'loss': '0.8442', 'grad_norm': '0.7668', 'learning_rate': '0.0001', 'ppl': '2.326', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.69', 'tokens/total': 165120000, 'tokens/trainable': 20063632, 'epoch': '0.3814'}
 19%|█████████████████████████████████████▌                                                                                                                                                               | 125/656 [5:56:02<25:11:10, 170.75s/it] 19%|█████████████████████████████████████▊                                                                                                                                                               | 126/656 [5:58:53<25:10:11, 170.97s/it]                                                                                                                                                                                                                                                  {'loss': '0.8253', 'grad_norm': '0.8156', 'learning_rate': '0.0001', 'ppl': '2.283', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.26', 'tokens/total': 166440960, 'tokens/trainable': 20238112, 'epoch': '0.3845'}
 19%|█████████████████████████████████████▊                                                                                                                                                               | 126/656 [5:58:53<25:10:11, 170.97s/it] 19%|██████████████████████████████████████▏                                                                                                                                                              | 127/656 [6:01:43<25:04:05, 170.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.8707', 'grad_norm': '0.7394', 'learning_rate': '0.0001', 'ppl': '2.388', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.41', 'tokens/total': 167761920, 'tokens/trainable': 20399492, 'epoch': '0.3875'}
 19%|██████████████████████████████████████▏                                                                                                                                                              | 127/656 [6:01:43<25:04:05, 170.60s/it] 20%|██████████████████████████████████████▍                                                                                                                                                              | 128/656 [6:04:35<25:05:19, 171.06s/it]                                                                                                                                                                                                                                                  {'loss': '0.8268', 'grad_norm': '0.6909', 'learning_rate': '0.0001', 'ppl': '2.286', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.7', 'tokens/total': 169082880, 'tokens/trainable': 20575580, 'epoch': '0.3906'}
 20%|██████████████████████████████████████▍                                                                                                                                                              | 128/656 [6:04:35<25:05:19, 171.06s/it] 20%|██████████████████████████████████████▋                                                                                                                                                              | 129/656 [6:07:27<25:04:09, 171.25s/it]                                                                                                                                                                                                                                                  {'loss': '0.8442', 'grad_norm': '0.7493', 'learning_rate': '0.0001', 'ppl': '2.326', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '29.47', 'tokens/total': 170403840, 'tokens/trainable': 20735112, 'epoch': '0.3937'}
 20%|██████████████████████████████████████▋                                                                                                                                                              | 129/656 [6:07:27<25:04:09, 171.25s/it] 20%|███████████████████████████████████████                                                                                                                                                              | 130/656 [6:10:18<25:02:19, 171.37s/it]                                                                                                                                                                                                                                                  {'loss': '0.8715', 'grad_norm': '0.7185', 'learning_rate': '0.0001', 'ppl': '2.39', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.43', 'tokens/total': 171724800, 'tokens/trainable': 20900644, 'epoch': '0.3967'}
 20%|███████████████████████████████████████                                                                                                                                                              | 130/656 [6:10:18<25:02:19, 171.37s/it] 20%|███████████████████████████████████████▎                                                                                                                                                             | 131/656 [6:13:10<25:00:04, 171.44s/it]                                                                                                                                                                                                                                                  {'loss': '0.8176', 'grad_norm': '0.6919', 'learning_rate': '0.0001', 'ppl': '2.265', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '29.44', 'tokens/total': 173045760, 'tokens/trainable': 21076484, 'epoch': '0.3998'}
 20%|███████████████████████████████████████▎                                                                                                                                                             | 131/656 [6:13:10<25:00:04, 171.44s/it] 20%|███████████████████████████████████████▋                                                                                                                                                             | 132/656 [6:16:00<24:53:39, 171.03s/it]                                                                                                                                                                                                                                                  {'loss': '0.8423', 'grad_norm': '0.7303', 'learning_rate': '0.0001', 'ppl': '2.322', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '27.77', 'tokens/total': 174366720, 'tokens/trainable': 21236548, 'epoch': '0.4028'}
 20%|███████████████████████████████████████▋                                                                                                                                                             | 132/656 [6:16:00<24:53:39, 171.03s/it] 20%|███████████████████████████████████████▉                                                                                                                                                             | 133/656 [6:18:51<24:51:14, 171.08s/it]                                                                                                                                                                                                                                                  {'loss': '0.834', 'grad_norm': '0.7189', 'learning_rate': '0.0001', 'ppl': '2.303', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.16', 'tokens/total': 175687680, 'tokens/trainable': 21395190, 'epoch': '0.4059'}
 20%|███████████████████████████████████████▉                                                                                                                                                             | 133/656 [6:18:51<24:51:14, 171.08s/it] 20%|████████████████████████████████████████▏                                                                                                                                                            | 134/656 [6:21:42<24:49:01, 171.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.8688', 'grad_norm': '0.7404', 'learning_rate': '0.0001', 'ppl': '2.384', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.93', 'tokens/total': 177008640, 'tokens/trainable': 21553208, 'epoch': '0.4089'}
 20%|████████████████████████████████████████▏                                                                                                                                                            | 134/656 [6:21:42<24:49:01, 171.15s/it] 21%|████████████████████████████████████████▌                                                                                                                                                            | 135/656 [6:24:33<24:45:21, 171.06s/it]                                                                                                                                                                                                                                                  {'loss': '0.8765', 'grad_norm': '0.7201', 'learning_rate': '0.0001', 'ppl': '2.403', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.49', 'tokens/total': 178329600, 'tokens/trainable': 21728704, 'epoch': '0.412'}
 21%|████████████████████████████████████████▌                                                                                                                                                            | 135/656 [6:24:33<24:45:21, 171.06s/it] 21%|████████████████████████████████████████▊                                                                                                                                                            | 136/656 [6:27:22<24:35:43, 170.28s/it]                                                                                                                                                                                                                                                  {'loss': '0.8458', 'grad_norm': '0.7203', 'learning_rate': '0.0001', 'ppl': '2.33', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '76.6', 'tokens/total': 179650560, 'tokens/trainable': 21888584, 'epoch': '0.415'}
 21%|████████████████████████████████████████▊                                                                                                                                                            | 136/656 [6:27:22<24:35:43, 170.28s/it] 21%|█████████████████████████████████████████▏                                                                                                                                                           | 137/656 [6:30:11<24:30:34, 170.01s/it]                                                                                                                                                                                                                                                  {'loss': '0.8152', 'grad_norm': '0.7674', 'learning_rate': '0.0001', 'ppl': '2.26', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.29', 'tokens/total': 180971520, 'tokens/trainable': 22033612, 'epoch': '0.4181'}
 21%|█████████████████████████████████████████▏                                                                                                                                                           | 137/656 [6:30:11<24:30:34, 170.01s/it] 21%|█████████████████████████████████████████▍                                                                                                                                                           | 138/656 [6:33:01<24:28:08, 170.05s/it]                                                                                                                                                                                                                                                  {'loss': '0.8696', 'grad_norm': '0.7283', 'learning_rate': '0.0001', 'ppl': '2.386', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '23.74', 'tokens/total': 182292480, 'tokens/trainable': 22186504, 'epoch': '0.4211'}
 21%|█████████████████████████████████████████▍                                                                                                                                                           | 138/656 [6:33:01<24:28:08, 170.05s/it] 21%|█████████████████████████████████████████▋                                                                                                                                                           | 139/656 [6:35:51<24:24:02, 169.91s/it]                                                                                                                                                                                                                                                  {'loss': '0.8341', 'grad_norm': '0.7283', 'learning_rate': '0.0001', 'ppl': '2.303', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.33', 'tokens/total': 183613440, 'tokens/trainable': 22349948, 'epoch': '0.4242'}
 21%|█████████████████████████████████████████▋                                                                                                                                                           | 139/656 [6:35:51<24:24:02, 169.91s/it] 21%|██████████████████████████████████████████                                                                                                                                                           | 140/656 [6:38:41<24:20:55, 169.87s/it]                                                                                                                                                                                                                                                  {'loss': '0.8271', 'grad_norm': '0.7396', 'learning_rate': '0.0001', 'ppl': '2.287', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.53', 'tokens/total': 184934400, 'tokens/trainable': 22500620, 'epoch': '0.4272'}
 21%|██████████████████████████████████████████                                                                                                                                                           | 140/656 [6:38:41<24:20:55, 169.87s/it] 21%|██████████████████████████████████████████▎                                                                                                                                                          | 141/656 [6:41:30<24:16:19, 169.67s/it]                                                                                                                                                                                                                                                  {'loss': '0.8476', 'grad_norm': '0.7811', 'learning_rate': '0.0001', 'ppl': '2.334', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.07', 'tokens/total': 186255360, 'tokens/trainable': 22644808, 'epoch': '0.4303'}
 21%|██████████████████████████████████████████▎                                                                                                                                                          | 141/656 [6:41:30<24:16:19, 169.67s/it] 22%|██████████████████████████████████████████▋                                                                                                                                                          | 142/656 [6:44:20<24:15:35, 169.91s/it]                                                                                                                                                                                                                                                  {'loss': '0.8839', 'grad_norm': '0.7261', 'learning_rate': '0.0001', 'ppl': '2.42', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.07', 'tokens/total': 187576320, 'tokens/trainable': 22807932, 'epoch': '0.4333'}
 22%|██████████████████████████████████████████▋                                                                                                                                                          | 142/656 [6:44:20<24:15:35, 169.91s/it] 22%|██████████████████████████████████████████▉                                                                                                                                                          | 143/656 [6:47:10<24:11:09, 169.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.7685', 'grad_norm': '0.6574', 'learning_rate': '0.0001', 'ppl': '2.156', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.42', 'tokens/total': 188897280, 'tokens/trainable': 22988884, 'epoch': '0.4364'}
 22%|██████████████████████████████████████████▉                                                                                                                                                          | 143/656 [6:47:10<24:11:09, 169.73s/it] 22%|███████████████████████████████████████████▏                                                                                                                                                         | 144/656 [6:50:00<24:11:10, 170.06s/it]                                                                                                                                                                                                                                                  {'loss': '0.8103', 'grad_norm': '0.7294', 'learning_rate': '0.0001', 'ppl': '2.249', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.68', 'tokens/total': 190218240, 'tokens/trainable': 23155904, 'epoch': '0.4394'}
 22%|███████████████████████████████████████████▏                                                                                                                                                         | 144/656 [6:50:00<24:11:10, 170.06s/it] 22%|███████████████████████████████████████████▌                                                                                                                                                         | 145/656 [6:52:54<24:16:28, 171.02s/it]                                                                                                                                                                                                                                                  {'loss': '0.8344', 'grad_norm': '0.7044', 'learning_rate': '0.0001', 'ppl': '2.303', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.07', 'tokens/total': 191539200, 'tokens/trainable': 23329584, 'epoch': '0.4425'}
 22%|███████████████████████████████████████████▌                                                                                                                                                         | 145/656 [6:52:54<24:16:28, 171.02s/it] 22%|███████████████████████████████████████████▊                                                                                                                                                         | 146/656 [6:55:43<24:09:15, 170.50s/it]                                                                                                                                                                                                                                                  {'loss': '0.7987', 'grad_norm': '0.6958', 'learning_rate': '0.0001', 'ppl': '2.223', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '27.13', 'tokens/total': 192860160, 'tokens/trainable': 23503852, 'epoch': '0.4455'}
 22%|███████████████████████████████████████████▊                                                                                                                                                         | 146/656 [6:55:43<24:09:15, 170.50s/it] 22%|████████████████████████████████████████████▏                                                                                                                                                        | 147/656 [6:58:34<24:08:49, 170.78s/it]                                                                                                                                                                                                                                                  {'loss': '0.8531', 'grad_norm': '0.7434', 'learning_rate': '0.0001', 'ppl': '2.347', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.93', 'tokens/total': 194181120, 'tokens/trainable': 23651400, 'epoch': '0.4486'}
 22%|████████████████████████████████████████████▏                                                                                                                                                        | 147/656 [6:58:34<24:08:49, 170.78s/it] 23%|████████████████████████████████████████████▍                                                                                                                                                        | 148/656 [7:01:24<24:03:37, 170.51s/it]                                                                                                                                                                                                                                                  {'loss': '0.823', 'grad_norm': '0.6772', 'learning_rate': '0.0001', 'ppl': '2.277', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.32', 'tokens/total': 195502080, 'tokens/trainable': 23827280, 'epoch': '0.4516'}
 23%|████████████████████████████████████████████▍                                                                                                                                                        | 148/656 [7:01:24<24:03:37, 170.51s/it] 23%|████████████████████████████████████████████▋                                                                                                                                                        | 149/656 [7:04:15<24:00:28, 170.47s/it]                                                                                                                                                                                                                                                  {'loss': '0.8552', 'grad_norm': '0.7105', 'learning_rate': '0.0001', 'ppl': '2.352', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.57', 'tokens/total': 196823040, 'tokens/trainable': 23987480, 'epoch': '0.4547'}
 23%|████████████████████████████████████████████▋                                                                                                                                                        | 149/656 [7:04:15<24:00:28, 170.47s/it] 23%|█████████████████████████████████████████████                                                                                                                                                        | 150/656 [7:07:04<23:55:22, 170.20s/it]                                                                                                                                                                                                                                                  {'loss': '0.8675', 'grad_norm': '0.7662', 'learning_rate': '0.0001', 'ppl': '2.381', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.18', 'tokens/total': 198144000, 'tokens/trainable': 24139612, 'epoch': '0.4577'}
 23%|█████████████████████████████████████████████                                                                                                                                                        | 150/656 [7:07:04<23:55:22, 170.20s/it] 23%|█████████████████████████████████████████████▎                                                                                                                                                       | 151/656 [7:09:57<23:58:57, 170.97s/it]                                                                                                                                                                                                                                                  {'loss': '0.8411', 'grad_norm': '0.8499', 'learning_rate': '0.0001', 'ppl': '2.319', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.77', 'tokens/total': 199464960, 'tokens/trainable': 24316304, 'epoch': '0.4608'}
 23%|█████████████████████████████████████████████▎                                                                                                                                                       | 151/656 [7:09:57<23:58:57, 170.97s/it] 23%|█████████████████████████████████████████████▋                                                                                                                                                       | 152/656 [7:12:47<23:53:01, 170.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.818', 'grad_norm': '0.7364', 'learning_rate': '0.0001', 'ppl': '2.266', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.4', 'tokens/total': 200785920, 'tokens/trainable': 24464652, 'epoch': '0.4638'}
 23%|█████████████████████████████████████████████▋                                                                                                                                                       | 152/656 [7:12:47<23:53:01, 170.60s/it] 23%|█████████████████████████████████████████████▉                                                                                                                                                       | 153/656 [7:15:37<23:49:24, 170.51s/it]                                                                                                                                                                                                                                                  {'loss': '0.8644', 'grad_norm': '0.7223', 'learning_rate': '0.0001', 'ppl': '2.373', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.97', 'tokens/total': 202106880, 'tokens/trainable': 24630380, 'epoch': '0.4669'}
 23%|█████████████████████████████████████████████▉                                                                                                                                                       | 153/656 [7:15:37<23:49:24, 170.51s/it] 23%|██████████████████████████████████████████████▏                                                                                                                                                      | 154/656 [7:18:28<23:47:02, 170.56s/it]                                                                                                                                                                                                                                                  {'loss': '0.8749', 'grad_norm': '0.734', 'learning_rate': '0.0001', 'ppl': '2.399', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '83.32', 'tokens/total': 203427840, 'tokens/trainable': 24787372, 'epoch': '0.4699'}
 23%|██████████████████████████████████████████████▏                                                                                                                                                      | 154/656 [7:18:28<23:47:02, 170.56s/it] 24%|██████████████████████████████████████████████▌                                                                                                                                                      | 155/656 [7:21:18<23:43:36, 170.49s/it]                                                                                                                                                                                                                                                  {'loss': '0.8418', 'grad_norm': '0.7635', 'learning_rate': '0.0001', 'ppl': '2.321', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.27', 'tokens/total': 204748800, 'tokens/trainable': 24950252, 'epoch': '0.473'}
 24%|██████████████████████████████████████████████▌                                                                                                                                                      | 155/656 [7:21:18<23:43:36, 170.49s/it] 24%|██████████████████████████████████████████████▊                                                                                                                                                      | 156/656 [7:24:09<23:41:11, 170.54s/it]                                                                                                                                                                                                                                                  {'loss': '0.8793', 'grad_norm': '0.8123', 'learning_rate': '0.0001', 'ppl': '2.409', 'memory/max_active (GiB)': '54.93', 'memory/max_allocated (GiB)': '54.93', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.83', 'tokens/total': 206069760, 'tokens/trainable': 25096566, 'epoch': '0.476'}
 24%|██████████████████████████████████████████████▊                                                                                                                                                      | 156/656 [7:24:09<23:41:11, 170.54s/it] 24%|███████████████████████████████████████████████▏                                                                                                                                                     | 157/656 [7:26:58<23:35:04, 170.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.8607', 'grad_norm': '0.7926', 'learning_rate': '0.0001', 'ppl': '2.365', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '49.17', 'tokens/total': 207390720, 'tokens/trainable': 25234740, 'epoch': '0.4791'}
 24%|███████████████████████████████████████████████▏                                                                                                                                                     | 157/656 [7:26:58<23:35:04, 170.15s/it] 24%|███████████████████████████████████████████████▍                                                                                                                                                     | 158/656 [7:29:50<23:37:03, 170.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.8305', 'grad_norm': '0.7292', 'learning_rate': '0.0001', 'ppl': '2.294', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.8', 'tokens/total': 208711680, 'tokens/trainable': 25387788, 'epoch': '0.4821'}
 24%|███████████████████████████████████████████████▍                                                                                                                                                     | 158/656 [7:29:50<23:37:03, 170.73s/it] 24%|███████████████████████████████████████████████▋                                                                                                                                                     | 159/656 [7:32:41<23:34:39, 170.78s/it]                                                                                                                                                                                                                                                  {'loss': '0.8587', 'grad_norm': '0.7869', 'learning_rate': '0.0001', 'ppl': '2.36', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.96', 'tokens/total': 210032640, 'tokens/trainable': 25531310, 'epoch': '0.4852'}
 24%|███████████████████████████████████████████████▋                                                                                                                                                     | 159/656 [7:32:41<23:34:39, 170.78s/it] 24%|████████████████████████████████████████████████                                                                                                                                                     | 160/656 [7:35:34<23:38:18, 171.57s/it]                                                                                                                                                                                                                                                  {'loss': '0.8654', 'grad_norm': '0.7104', 'learning_rate': '0.0001', 'ppl': '2.376', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.72', 'tokens/total': 211353600, 'tokens/trainable': 25697180, 'epoch': '0.4883'}
 24%|████████████████████████████████████████████████                                                                                                                                                     | 160/656 [7:35:34<23:38:18, 171.57s/it] 25%|████████████████████████████████████████████████▎                                                                                                                                                    | 161/656 [7:38:24<23:30:11, 170.93s/it]                                                                                                                                                                                                                                                  {'loss': '0.9033', 'grad_norm': '0.7509', 'learning_rate': '0.0001', 'ppl': '2.468', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.3', 'tokens/total': 212674560, 'tokens/trainable': 25857628, 'epoch': '0.4913'}
 25%|████████████████████████████████████████████████▎                                                                                                                                                    | 161/656 [7:38:24<23:30:11, 170.93s/it] 25%|████████████████████████████████████████████████▋                                                                                                                                                    | 162/656 [7:41:13<23:22:49, 170.38s/it]                                                                                                                                                                                                                                                  {'loss': '0.8367', 'grad_norm': '0.7461', 'learning_rate': '0.0001', 'ppl': '2.309', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.05', 'tokens/total': 213995520, 'tokens/trainable': 26012656, 'epoch': '0.4944'}
 25%|████████████████████████████████████████████████▋                                                                                                                                                    | 162/656 [7:41:13<23:22:49, 170.38s/it] 25%|████████████████████████████████████████████████▉                                                                                                                                                    | 163/656 [7:44:04<23:20:58, 170.50s/it]                                                                                                                                                                                                                                                  {'loss': '0.8126', 'grad_norm': '0.7128', 'learning_rate': '0.0001', 'ppl': '2.254', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.68', 'tokens/total': 215316480, 'tokens/trainable': 26184316, 'epoch': '0.4974'}
 25%|████████████████████████████████████████████████▉                                                                                                                                                    | 163/656 [7:44:04<23:20:58, 170.50s/it] 25%|█████████████████████████████████████████████████▎                                                                                                                                                   | 164/656 [7:46:55<23:21:10, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.8337', 'grad_norm': '1.223', 'learning_rate': '0.0001', 'ppl': '2.302', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.57', 'tokens/total': 216637440, 'tokens/trainable': 26345020, 'epoch': '0.5005'}
 25%|█████████████████████████████████████████████████▎                                                                                                                                                   | 164/656 [7:46:55<23:21:10, 170.88s/it] 25%|█████████████████████████████████████████████████▌                                                                                                                                                   | 165/656 [7:49:47<23:18:56, 170.95s/it]                                                                                                                                                                                                                                                  {'loss': '0.7892', 'grad_norm': '0.6662', 'learning_rate': '0.0001', 'ppl': '2.202', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.27', 'tokens/total': 217958400, 'tokens/trainable': 26526994, 'epoch': '0.5035'}
 25%|█████████████████████████████████████████████████▌                                                                                                                                                   | 165/656 [7:49:47<23:18:56, 170.95s/it] 25%|█████████████████████████████████████████████████▊                                                                                                                                                   | 166/656 [7:52:40<23:21:11, 171.58s/it]                                                                                                                                                                                                                                                  {'loss': '0.8002', 'grad_norm': '0.6855', 'learning_rate': '0.0001', 'ppl': '2.226', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.75', 'tokens/total': 219279360, 'tokens/trainable': 26705004, 'epoch': '0.5066'}
 25%|█████████████████████████████████████████████████▊                                                                                                                                                   | 166/656 [7:52:40<23:21:11, 171.58s/it] 25%|██████████████████████████████████████████████████▏                                                                                                                                                  | 167/656 [7:55:31<23:18:55, 171.65s/it]                                                                                                                                                                                                                                                  {'loss': '0.83', 'grad_norm': '0.7145', 'learning_rate': '0.0001', 'ppl': '2.293', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '57.51', 'tokens/total': 220600320, 'tokens/trainable': 26870068, 'epoch': '0.5096'}
 25%|██████████████████████████████████████████████████▏                                                                                                                                                  | 167/656 [7:55:31<23:18:55, 171.65s/it] 26%|██████████████████████████████████████████████████▍                                                                                                                                                  | 168/656 [7:58:20<23:08:14, 170.68s/it]                                                                                                                                                                                                                                                  {'loss': '0.8563', 'grad_norm': '0.7244', 'learning_rate': '0.0001', 'ppl': '2.355', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.66', 'tokens/total': 221921280, 'tokens/trainable': 27032544, 'epoch': '0.5127'}
 26%|██████████████████████████████████████████████████▍                                                                                                                                                  | 168/656 [7:58:20<23:08:14, 170.68s/it] 26%|██████████████████████████████████████████████████▊                                                                                                                                                  | 169/656 [8:01:10<23:04:20, 170.56s/it]                                                                                                                                                                                                                                                  {'loss': '0.8306', 'grad_norm': '0.7782', 'learning_rate': '0.0001', 'ppl': '2.295', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '101.8', 'tokens/total': 223242240, 'tokens/trainable': 27198024, 'epoch': '0.5157'}
 26%|██████████████████████████████████████████████████▊                                                                                                                                                  | 169/656 [8:01:10<23:04:20, 170.56s/it] 26%|███████████████████████████████████████████████████                                                                                                                                                  | 170/656 [8:04:01<23:02:57, 170.74s/it]                                                                                                                                                                                                                                                  {'loss': '0.8366', 'grad_norm': '0.9359', 'learning_rate': '0.0001', 'ppl': '2.309', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '30.57', 'tokens/total': 224563200, 'tokens/trainable': 27369084, 'epoch': '0.5188'}
 26%|███████████████████████████████████████████████████                                                                                                                                                  | 170/656 [8:04:01<23:02:57, 170.74s/it] 26%|███████████████████████████████████████████████████▎                                                                                                                                                 | 171/656 [8:06:53<23:03:37, 171.17s/it]                                                                                                                                                                                                                                                  {'loss': '0.79', 'grad_norm': '0.6766', 'learning_rate': '0.0001', 'ppl': '2.203', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.95', 'tokens/total': 225884160, 'tokens/trainable': 27542932, 'epoch': '0.5218'}
 26%|███████████████████████████████████████████████████▎                                                                                                                                                 | 171/656 [8:06:53<23:03:37, 171.17s/it] 26%|███████████████████████████████████████████████████▋                                                                                                                                                 | 172/656 [8:09:44<22:59:28, 171.01s/it]                                                                                                                                                                                                                                                  {'loss': '0.8185', 'grad_norm': '0.8539', 'learning_rate': '0.0001', 'ppl': '2.267', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.04', 'tokens/total': 227205120, 'tokens/trainable': 27716508, 'epoch': '0.5249'}
 26%|███████████████████████████████████████████████████▋                                                                                                                                                 | 172/656 [8:09:44<22:59:28, 171.01s/it] 26%|███████████████████████████████████████████████████▉                                                                                                                                                 | 173/656 [8:12:34<22:53:16, 170.59s/it]                                                                                                                                                                                                                                                  {'loss': '0.8186', 'grad_norm': '0.7368', 'learning_rate': '0.0001', 'ppl': '2.267', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '17.2', 'tokens/total': 228526080, 'tokens/trainable': 27874752, 'epoch': '0.5279'}
 26%|███████████████████████████████████████████████████▉                                                                                                                                                 | 173/656 [8:12:34<22:53:16, 170.59s/it] 27%|████████████████████████████████████████████████████▎                                                                                                                                                | 174/656 [8:15:25<22:51:26, 170.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.8137', 'grad_norm': '0.7164', 'learning_rate': '0.0001', 'ppl': '2.256', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.93', 'tokens/total': 229847040, 'tokens/trainable': 28036784, 'epoch': '0.531'}
 27%|████████████████████████████████████████████████████▎                                                                                                                                                | 174/656 [8:15:25<22:51:26, 170.72s/it] 27%|████████████████████████████████████████████████████▌                                                                                                                                                | 175/656 [8:18:15<22:48:14, 170.67s/it]                                                                                                                                                                                                                                                  {'loss': '0.8032', 'grad_norm': '0.7157', 'learning_rate': '0.0001', 'ppl': '2.233', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.39', 'tokens/total': 231168000, 'tokens/trainable': 28195028, 'epoch': '0.534'}
 27%|████████████████████████████████████████████████████▌                                                                                                                                                | 175/656 [8:18:15<22:48:14, 170.67s/it] 27%|████████████████████████████████████████████████████▊                                                                                                                                                | 176/656 [8:21:06<22:46:05, 170.76s/it]                                                                                                                                                                                                                                                  {'loss': '0.8245', 'grad_norm': '0.7469', 'learning_rate': '0.0001', 'ppl': '2.281', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.17', 'tokens/total': 232488960, 'tokens/trainable': 28345672, 'epoch': '0.5371'}
 27%|████████████████████████████████████████████████████▊                                                                                                                                                | 176/656 [8:21:06<22:46:05, 170.76s/it] 27%|█████████████████████████████████████████████████████▏                                                                                                                                               | 177/656 [8:23:55<22:37:54, 170.09s/it]                                                                                                                                                                                                                                                  {'loss': '0.802', 'grad_norm': '0.6732', 'learning_rate': '0.0001', 'ppl': '2.23', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.37', 'tokens/total': 233809920, 'tokens/trainable': 28519216, 'epoch': '0.5401'}
 27%|█████████████████████████████████████████████████████▏                                                                                                                                               | 177/656 [8:23:55<22:37:54, 170.09s/it] 27%|█████████████████████████████████████████████████████▍                                                                                                                                               | 178/656 [8:26:45<22:36:09, 170.23s/it]                                                                                                                                                                                                                                                  {'loss': '0.8098', 'grad_norm': '0.7008', 'learning_rate': '0.0001', 'ppl': '2.247', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.57', 'tokens/total': 235130880, 'tokens/trainable': 28686000, 'epoch': '0.5432'}
 27%|█████████████████████████████████████████████████████▍                                                                                                                                               | 178/656 [8:26:45<22:36:09, 170.23s/it] 27%|█████████████████████████████████████████████████████▊                                                                                                                                               | 179/656 [8:29:35<22:31:28, 170.00s/it]                                                                                                                                                                                                                                                  {'loss': '0.8926', 'grad_norm': '0.6986', 'learning_rate': '0.0001', 'ppl': '2.442', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.6', 'tokens/total': 236451840, 'tokens/trainable': 28853716, 'epoch': '0.5462'}
 27%|█████████████████████████████████████████████████████▊                                                                                                                                               | 179/656 [8:29:35<22:31:28, 170.00s/it] 27%|██████████████████████████████████████████████████████                                                                                                                                               | 180/656 [8:32:25<22:29:20, 170.08s/it]                                                                                                                                                                                                                                                  {'loss': '0.8101', 'grad_norm': '0.9622', 'learning_rate': '0.0001', 'ppl': '2.248', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.84', 'tokens/total': 237772800, 'tokens/trainable': 29020644, 'epoch': '0.5493'}
 27%|██████████████████████████████████████████████████████                                                                                                                                               | 180/656 [8:32:25<22:29:20, 170.08s/it] 28%|██████████████████████████████████████████████████████▎                                                                                                                                              | 181/656 [8:35:15<22:26:22, 170.07s/it]                                                                                                                                                                                                                                                  {'loss': '0.8739', 'grad_norm': '0.7429', 'learning_rate': '0.0001', 'ppl': '2.396', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '60.91', 'tokens/total': 239093760, 'tokens/trainable': 29168294, 'epoch': '0.5523'}
 28%|██████████████████████████████████████████████████████▎                                                                                                                                              | 181/656 [8:35:15<22:26:22, 170.07s/it] 28%|██████████████████████████████████████████████████████▋                                                                                                                                              | 182/656 [8:38:06<22:25:19, 170.30s/it]                                                                                                                                                                                                                                                  {'loss': '0.8267', 'grad_norm': '0.7066', 'learning_rate': '0.0001', 'ppl': '2.286', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.98', 'tokens/total': 240414720, 'tokens/trainable': 29325498, 'epoch': '0.5554'}
 28%|██████████████████████████████████████████████████████▋                                                                                                                                              | 182/656 [8:38:06<22:25:19, 170.30s/it] 28%|██████████████████████████████████████████████████████▉                                                                                                                                              | 183/656 [8:40:58<22:27:02, 170.87s/it]                                                                                                                                                                                                                                                  {'loss': '0.7901', 'grad_norm': '0.668', 'learning_rate': '0.0001', 'ppl': '2.204', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.71', 'tokens/total': 241735680, 'tokens/trainable': 29510796, 'epoch': '0.5584'}
 28%|██████████████████████████████████████████████████████▉                                                                                                                                              | 183/656 [8:40:58<22:27:02, 170.87s/it] 28%|███████████████████████████████████████████████████████▎                                                                                                                                             | 184/656 [8:43:49<22:24:08, 170.87s/it]                                                                                                                                                                                                                                                  {'loss': '0.8517', 'grad_norm': '0.7446', 'learning_rate': '0.0001', 'ppl': '2.344', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.79', 'tokens/total': 243056640, 'tokens/trainable': 29656558, 'epoch': '0.5615'}
 28%|███████████████████████████████████████████████████████▎                                                                                                                                             | 184/656 [8:43:49<22:24:08, 170.87s/it] 28%|███████████████████████████████████████████████████████▌                                                                                                                                             | 185/656 [8:46:39<22:20:17, 170.74s/it]                                                                                                                                                                                                                                                  {'loss': '0.8518', 'grad_norm': '0.7332', 'learning_rate': '0.0001', 'ppl': '2.344', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.33', 'tokens/total': 244377600, 'tokens/trainable': 29809904, 'epoch': '0.5645'}
 28%|███████████████████████████████████████████████████████▌                                                                                                                                             | 185/656 [8:46:39<22:20:17, 170.74s/it] 28%|███████████████████████████████████████████████████████▊                                                                                                                                             | 186/656 [8:49:31<22:19:50, 171.04s/it]                                                                                                                                                                                                                                                  {'loss': '0.8431', 'grad_norm': '0.6934', 'learning_rate': '0.0001', 'ppl': '2.323', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.55', 'tokens/total': 245698560, 'tokens/trainable': 29978960, 'epoch': '0.5676'}
 28%|███████████████████████████████████████████████████████▊                                                                                                                                             | 186/656 [8:49:31<22:19:50, 171.04s/it] 29%|████████████████████████████████████████████████████████▏                                                                                                                                            | 187/656 [8:52:22<22:16:21, 170.96s/it]                                                                                                                                                                                                                                                  {'loss': '0.812', 'grad_norm': '0.6788', 'learning_rate': '0.0001', 'ppl': '2.252', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '20.99', 'tokens/total': 247019520, 'tokens/trainable': 30148296, 'epoch': '0.5706'}
 29%|████████████████████████████████████████████████████████▏                                                                                                                                            | 187/656 [8:52:22<22:16:21, 170.96s/it] 29%|████████████████████████████████████████████████████████▍                                                                                                                                            | 188/656 [8:55:13<22:13:32, 170.97s/it]                                                                                                                                                                                                                                                  {'loss': '0.8524', 'grad_norm': '0.7625', 'learning_rate': '0.0001', 'ppl': '2.345', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.28', 'tokens/total': 248340480, 'tokens/trainable': 30289588, 'epoch': '0.5737'}
 29%|████████████████████████████████████████████████████████▍                                                                                                                                            | 188/656 [8:55:13<22:13:32, 170.97s/it] 29%|████████████████████████████████████████████████████████▊                                                                                                                                            | 189/656 [8:58:05<22:12:16, 171.17s/it]                                                                                                                                                                                                                                                  {'loss': '0.8405', 'grad_norm': '0.7262', 'learning_rate': '0.0001', 'ppl': '2.317', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.4', 'tokens/total': 249661440, 'tokens/trainable': 30445792, 'epoch': '0.5767'}
 29%|████████████████████████████████████████████████████████▊                                                                                                                                            | 189/656 [8:58:05<22:12:16, 171.17s/it] 29%|█████████████████████████████████████████████████████████                                                                                                                                            | 190/656 [9:00:55<22:06:46, 170.83s/it]                                                                                                                                                                                                                                                  {'loss': '0.8054', 'grad_norm': '0.7221', 'learning_rate': '0.0001', 'ppl': '2.238', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.04', 'tokens/total': 250982400, 'tokens/trainable': 30597022, 'epoch': '0.5798'}
 29%|█████████████████████████████████████████████████████████                                                                                                                                            | 190/656 [9:00:55<22:06:46, 170.83s/it] 29%|█████████████████████████████████████████████████████████▎                                                                                                                                           | 191/656 [9:03:46<22:05:29, 171.03s/it]                                                                                                                                                                                                                                                  {'loss': '0.879', 'grad_norm': '0.7664', 'learning_rate': '0.0001', 'ppl': '2.408', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '26.44', 'tokens/total': 252303360, 'tokens/trainable': 30787784, 'epoch': '0.5829'}
 29%|█████████████████████████████████████████████████████████▎                                                                                                                                           | 191/656 [9:03:46<22:05:29, 171.03s/it] 29%|█████████████████████████████████████████████████████████▋                                                                                                                                           | 192/656 [9:06:37<22:03:26, 171.13s/it]                                                                                                                                                                                                                                                  {'loss': '0.8817', 'grad_norm': '0.676', 'learning_rate': '0.0001', 'ppl': '2.415', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '77.27', 'tokens/total': 253624320, 'tokens/trainable': 30966872, 'epoch': '0.5859'}
 29%|█████████████████████████████████████████████████████████▋                                                                                                                                           | 192/656 [9:06:37<22:03:26, 171.13s/it] 29%|█████████████████████████████████████████████████████████▉                                                                                                                                           | 193/656 [9:09:27<21:57:13, 170.70s/it]                                                                                                                                                                                                                                                  {'loss': '0.8425', 'grad_norm': '0.753', 'learning_rate': '0.0001', 'ppl': '2.322', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '24.39', 'tokens/total': 254945280, 'tokens/trainable': 31112816, 'epoch': '0.589'}
 29%|█████████████████████████████████████████████████████████▉                                                                                                                                           | 193/656 [9:09:27<21:57:13, 170.70s/it] 30%|██████████████████████████████████████████████████████████▎                                                                                                                                          | 194/656 [9:12:19<21:56:04, 170.92s/it]                                                                                                                                                                                                                                                  {'loss': '0.7902', 'grad_norm': '0.6931', 'learning_rate': '0.0001', 'ppl': '2.204', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '57.78', 'tokens/total': 256266240, 'tokens/trainable': 31275980, 'epoch': '0.592'}
 30%|██████████████████████████████████████████████████████████▎                                                                                                                                          | 194/656 [9:12:19<21:56:04, 170.92s/it] 30%|██████████████████████████████████████████████████████████▌                                                                                                                                          | 195/656 [9:15:10<21:55:11, 171.18s/it]                                                                                                                                                                                                                                                  {'loss': '0.8143', 'grad_norm': '0.7114', 'learning_rate': '0.0001', 'ppl': '2.258', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.71', 'tokens/total': 257587200, 'tokens/trainable': 31445876, 'epoch': '0.5951'}
 30%|██████████████████████████████████████████████████████████▌                                                                                                                                          | 195/656 [9:15:10<21:55:11, 171.18s/it] 30%|██████████████████████████████████████████████████████████▊                                                                                                                                          | 196/656 [9:17:59<21:45:28, 170.28s/it]                                                                                                                                                                                                                                                  {'loss': '0.8572', 'grad_norm': '0.7617', 'learning_rate': '0.0001', 'ppl': '2.357', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.22', 'tokens/total': 258908160, 'tokens/trainable': 31586656, 'epoch': '0.5981'}
 30%|██████████████████████████████████████████████████████████▊                                                                                                                                          | 196/656 [9:17:59<21:45:28, 170.28s/it] 30%|███████████████████████████████████████████████████████████▏                                                                                                                                         | 197/656 [9:20:50<21:45:08, 170.61s/it]                                                                                                                                                                                                                                                  {'loss': '0.8677', 'grad_norm': '0.7359', 'learning_rate': '0.0001', 'ppl': '2.381', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.07', 'tokens/total': 260229120, 'tokens/trainable': 31736960, 'epoch': '0.6012'}
 30%|███████████████████████████████████████████████████████████▏                                                                                                                                         | 197/656 [9:20:50<21:45:08, 170.61s/it] 30%|███████████████████████████████████████████████████████████▍                                                                                                                                         | 198/656 [9:23:41<21:43:13, 170.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.7981', 'grad_norm': '0.7204', 'learning_rate': '0.0001', 'ppl': '2.221', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.06', 'tokens/total': 261550080, 'tokens/trainable': 31899896, 'epoch': '0.6042'}
 30%|███████████████████████████████████████████████████████████▍                                                                                                                                         | 198/656 [9:23:41<21:43:13, 170.73s/it] 30%|███████████████████████████████████████████████████████████▊                                                                                                                                         | 199/656 [9:26:32<21:42:08, 170.96s/it]                                                                                                                                                                                                                                                  {'loss': '0.8132', 'grad_norm': '0.7127', 'learning_rate': '0.0001', 'ppl': '2.255', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.63', 'tokens/total': 262871040, 'tokens/trainable': 32057952, 'epoch': '0.6073'}
 30%|███████████████████████████████████████████████████████████▊                                                                                                                                         | 199/656 [9:26:32<21:42:08, 170.96s/it] 30%|████████████████████████████████████████████████████████████                                                                                                                                         | 200/656 [9:29:25<21:41:52, 171.30s/it]                                                                                                                                                                                                                                                  {'loss': '0.8583', 'grad_norm': '0.7204', 'learning_rate': '0.0001', 'ppl': '2.359', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.21', 'tokens/total': 264192000, 'tokens/trainable': 32226212, 'epoch': '0.6103'}
 30%|████████████████████████████████████████████████████████████                                                                                                                                         | 200/656 [9:29:25<21:41:52, 171.30s/it] 31%|████████████████████████████████████████████████████████████▎                                                                                                                                        | 201/656 [9:32:14<21:34:13, 170.67s/it]                                                                                                                                                                                                                                                  {'loss': '0.8587', 'grad_norm': '0.7728', 'learning_rate': '0.0001', 'ppl': '2.36', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.09', 'tokens/total': 265512960, 'tokens/trainable': 32366956, 'epoch': '0.6134'}
 31%|████████████████████████████████████████████████████████████▎                                                                                                                                        | 201/656 [9:32:14<21:34:13, 170.67s/it] 31%|████████████████████████████████████████████████████████████▋                                                                                                                                        | 202/656 [9:35:04<21:29:31, 170.42s/it]                                                                                                                                                                                                                                                  {'loss': '0.914', 'grad_norm': '0.7276', 'learning_rate': '0.0001', 'ppl': '2.494', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.56', 'tokens/total': 266833920, 'tokens/trainable': 32518672, 'epoch': '0.6164'}
 31%|████████████████████████████████████████████████████████████▋                                                                                                                                        | 202/656 [9:35:04<21:29:31, 170.42s/it] 31%|████████████████████████████████████████████████████████████▉                                                                                                                                        | 203/656 [9:37:55<21:30:04, 170.87s/it]                                                                                                                                                                                                                                                  {'loss': '0.8185', 'grad_norm': '0.6919', 'learning_rate': '0.0001', 'ppl': '2.267', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.86', 'tokens/total': 268154880, 'tokens/trainable': 32684654, 'epoch': '0.6195'}
 31%|████████████████████████████████████████████████████████████▉                                                                                                                                        | 203/656 [9:37:55<21:30:04, 170.87s/it] 31%|█████████████████████████████████████████████████████████████▎                                                                                                                                       | 204/656 [9:40:49<21:32:24, 171.56s/it]                                                                                                                                                                                                                                                  {'loss': '0.8284', 'grad_norm': '0.6764', 'learning_rate': '0.0001', 'ppl': '2.29', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.36', 'tokens/total': 269475840, 'tokens/trainable': 32856326, 'epoch': '0.6225'}
 31%|█████████████████████████████████████████████████████████████▎                                                                                                                                       | 204/656 [9:40:49<21:32:24, 171.56s/it] 31%|█████████████████████████████████████████████████████████████▌                                                                                                                                       | 205/656 [9:43:39<21:26:45, 171.19s/it]                                                                                                                                                                                                                                                  {'loss': '0.8262', 'grad_norm': '0.7156', 'learning_rate': '0.0001', 'ppl': '2.285', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.74', 'tokens/total': 270796800, 'tokens/trainable': 33023512, 'epoch': '0.6256'}
 31%|█████████████████████████████████████████████████████████████▌                                                                                                                                       | 205/656 [9:43:39<21:26:45, 171.19s/it] 31%|█████████████████████████████████████████████████████████████▊                                                                                                                                       | 206/656 [9:46:31<21:26:36, 171.55s/it]                                                                                                                                                                                                                                                  {'loss': '0.8164', 'grad_norm': '0.7354', 'learning_rate': '0.0001', 'ppl': '2.262', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.57', 'tokens/total': 272117760, 'tokens/trainable': 33183842, 'epoch': '0.6286'}
 31%|█████████████████████████████████████████████████████████████▊                                                                                                                                       | 206/656 [9:46:31<21:26:36, 171.55s/it] 32%|██████████████████████████████████████████████████████████████▏                                                                                                                                      | 207/656 [9:49:21<21:20:18, 171.09s/it]                                                                                                                                                                                                                                                  {'loss': '0.8397', 'grad_norm': '0.7208', 'learning_rate': '0.0001', 'ppl': '2.316', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '57.09', 'tokens/total': 273438720, 'tokens/trainable': 33345088, 'epoch': '0.6317'}
 32%|██████████████████████████████████████████████████████████████▏                                                                                                                                      | 207/656 [9:49:21<21:20:18, 171.09s/it] 32%|██████████████████████████████████████████████████████████████▍                                                                                                                                      | 208/656 [9:52:14<21:20:04, 171.44s/it]                                                                                                                                                                                                                                                  {'loss': '0.8072', 'grad_norm': '0.691', 'learning_rate': '0.0001', 'ppl': '2.242', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.71', 'tokens/total': 274759680, 'tokens/trainable': 33513700, 'epoch': '0.6347'}
 32%|██████████████████████████████████████████████████████████████▍                                                                                                                                      | 208/656 [9:52:14<21:20:04, 171.44s/it] 32%|██████████████████████████████████████████████████████████████▊                                                                                                                                      | 209/656 [9:55:03<21:11:50, 170.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.8411', 'grad_norm': '0.699', 'learning_rate': '0.0001', 'ppl': '2.319', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '107', 'tokens/total': 276080640, 'tokens/trainable': 33673552, 'epoch': '0.6378'}
 32%|██████████████████████████████████████████████████████████████▊                                                                                                                                      | 209/656 [9:55:03<21:11:50, 170.72s/it] 32%|███████████████████████████████████████████████████████████████                                                                                                                                      | 210/656 [9:57:52<21:05:11, 170.21s/it]                                                                                                                                                                                                                                                  {'loss': '0.9693', 'grad_norm': '1.009', 'learning_rate': '0.0001', 'ppl': '2.636', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.58', 'tokens/total': 277401600, 'tokens/trainable': 33814844, 'epoch': '0.6408'}
 32%|███████████████████████████████████████████████████████████████                                                                                                                                      | 210/656 [9:57:52<21:05:11, 170.21s/it] 32%|███████████████████████████████████████████████████████████████                                                                                                                                     | 211/656 [10:00:45<21:08:18, 171.01s/it]                                                                                                                                                                                                                                                  {'loss': '0.7803', 'grad_norm': '0.6441', 'learning_rate': '0.0001', 'ppl': '2.182', 'memory/max_active (GiB)': '54.98', 'memory/max_allocated (GiB)': '54.98', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.2', 'tokens/total': 278722560, 'tokens/trainable': 34005228, 'epoch': '0.6439'}
 32%|███████████████████████████████████████████████████████████████                                                                                                                                     | 211/656 [10:00:45<21:08:18, 171.01s/it] 32%|███████████████████████████████████████████████████████████████▎                                                                                                                                    | 212/656 [10:03:36<21:05:30, 171.01s/it]                                                                                                                                                                                                                                                  {'loss': '0.7688', 'grad_norm': '0.659', 'learning_rate': '0.0001', 'ppl': '2.157', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.2', 'tokens/total': 280043520, 'tokens/trainable': 34193036, 'epoch': '0.6469'}
 32%|███████████████████████████████████████████████████████████████▎                                                                                                                                    | 212/656 [10:03:36<21:05:30, 171.01s/it] 32%|███████████████████████████████████████████████████████████████▋                                                                                                                                    | 213/656 [10:06:26<21:00:29, 170.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.8424', 'grad_norm': '0.7136', 'learning_rate': '0.0001', 'ppl': '2.322', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.04', 'tokens/total': 281364480, 'tokens/trainable': 34350208, 'epoch': '0.65'}
 32%|███████████████████████████████████████████████████████████████▋                                                                                                                                    | 213/656 [10:06:26<21:00:29, 170.72s/it] 33%|███████████████████████████████████████████████████████████████▉                                                                                                                                    | 214/656 [10:09:15<20:55:15, 170.40s/it]                                                                                                                                                                                                                                                  {'loss': '0.8392', 'grad_norm': '0.7203', 'learning_rate': '0.0001', 'ppl': '2.314', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '82.15', 'tokens/total': 282685440, 'tokens/trainable': 34507228, 'epoch': '0.653'}
 33%|███████████████████████████████████████████████████████████████▉                                                                                                                                    | 214/656 [10:09:15<20:55:15, 170.40s/it] 33%|████████████████████████████████████████████████████████████████▏                                                                                                                                   | 215/656 [10:12:04<20:49:23, 169.98s/it]                                                                                                                                                                                                                                                  {'loss': '0.9284', 'grad_norm': '0.7431', 'learning_rate': '0.0001', 'ppl': '2.531', 'memory/max_active (GiB)': '54.75', 'memory/max_allocated (GiB)': '54.75', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '25.95', 'tokens/total': 284006400, 'tokens/trainable': 34648148, 'epoch': '0.6561'}
 33%|████████████████████████████████████████████████████████████████▏                                                                                                                                   | 215/656 [10:12:04<20:49:23, 169.98s/it] 33%|████████████████████████████████████████████████████████████████▌                                                                                                                                   | 216/656 [10:14:54<20:45:52, 169.89s/it]                                                                                                                                                                                                                                                  {'loss': '0.8577', 'grad_norm': '0.7488', 'learning_rate': '0.0001', 'ppl': '2.358', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '25.93', 'tokens/total': 285327360, 'tokens/trainable': 34788760, 'epoch': '0.6591'}
 33%|████████████████████████████████████████████████████████████████▌                                                                                                                                   | 216/656 [10:14:54<20:45:52, 169.89s/it] 33%|████████████████████████████████████████████████████████████████▊                                                                                                                                   | 217/656 [10:17:44<20:42:32, 169.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.8792', 'grad_norm': '0.7326', 'learning_rate': '0.0001', 'ppl': '2.409', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '25.71', 'tokens/total': 286648320, 'tokens/trainable': 34940056, 'epoch': '0.6622'}
 33%|████████████████████████████████████████████████████████████████▊                                                                                                                                   | 217/656 [10:17:44<20:42:32, 169.82s/it] 33%|█████████████████████████████████████████████████████████████████▏                                                                                                                                  | 218/656 [10:20:36<20:45:48, 170.66s/it]                                                                                                                                                                                                                                                  {'loss': '0.8211', 'grad_norm': '0.7155', 'learning_rate': '0.0001', 'ppl': '2.273', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.04', 'tokens/total': 287969280, 'tokens/trainable': 35100096, 'epoch': '0.6652'}
 33%|█████████████████████████████████████████████████████████████████▏                                                                                                                                  | 218/656 [10:20:36<20:45:48, 170.66s/it] 33%|█████████████████████████████████████████████████████████████████▍                                                                                                                                  | 219/656 [10:23:27<20:43:47, 170.77s/it]                                                                                                                                                                                                                                                  {'loss': '0.8477', 'grad_norm': '0.7159', 'learning_rate': '0.0001', 'ppl': '2.334', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.29', 'tokens/total': 289290240, 'tokens/trainable': 35258992, 'epoch': '0.6683'}
 33%|█████████████████████████████████████████████████████████████████▍                                                                                                                                  | 219/656 [10:23:27<20:43:47, 170.77s/it] 34%|█████████████████████████████████████████████████████████████████▋                                                                                                                                  | 220/656 [10:26:20<20:44:17, 171.23s/it]                                                                                                                                                                                                                                                  {'loss': '0.8218', 'grad_norm': '0.7029', 'learning_rate': '0.0001', 'ppl': '2.275', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.35', 'tokens/total': 290611200, 'tokens/trainable': 35427076, 'epoch': '0.6713'}
 34%|█████████████████████████████████████████████████████████████████▋                                                                                                                                  | 220/656 [10:26:20<20:44:17, 171.23s/it] 34%|██████████████████████████████████████████████████████████████████                                                                                                                                  | 221/656 [10:29:11<20:41:14, 171.21s/it]                                                                                                                                                                                                                                                  {'loss': '0.8202', 'grad_norm': '0.6837', 'learning_rate': '0.0001', 'ppl': '2.271', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.16', 'tokens/total': 291932160, 'tokens/trainable': 35595668, 'epoch': '0.6744'}
 34%|██████████████████████████████████████████████████████████████████                                                                                                                                  | 221/656 [10:29:11<20:41:14, 171.21s/it] 34%|██████████████████████████████████████████████████████████████████▎                                                                                                                                 | 222/656 [10:32:00<20:33:43, 170.56s/it]                                                                                                                                                                                                                                                  {'loss': '0.8215', 'grad_norm': '0.7026', 'learning_rate': '0.0001', 'ppl': '2.274', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.57', 'tokens/total': 293253120, 'tokens/trainable': 35755768, 'epoch': '0.6774'}
 34%|██████████████████████████████████████████████████████████████████▎                                                                                                                                 | 222/656 [10:32:00<20:33:43, 170.56s/it] 34%|██████████████████████████████████████████████████████████████████▋                                                                                                                                 | 223/656 [10:34:51<20:32:41, 170.81s/it]                                                                                                                                                                                                                                                  {'loss': '0.8239', 'grad_norm': '0.7077', 'learning_rate': '0.0001', 'ppl': '2.279', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.92', 'tokens/total': 294574080, 'tokens/trainable': 35917616, 'epoch': '0.6805'}
 34%|██████████████████████████████████████████████████████████████████▋                                                                                                                                 | 223/656 [10:34:51<20:32:41, 170.81s/it] 34%|██████████████████████████████████████████████████████████████████▉                                                                                                                                 | 224/656 [10:37:41<20:27:27, 170.48s/it]                                                                                                                                                                                                                                                  {'loss': '0.8197', 'grad_norm': '0.7058', 'learning_rate': '0.0001', 'ppl': '2.27', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '19.14', 'tokens/total': 295895040, 'tokens/trainable': 36075088, 'epoch': '0.6836'}
 34%|██████████████████████████████████████████████████████████████████▉                                                                                                                                 | 224/656 [10:37:41<20:27:27, 170.48s/it] 34%|███████████████████████████████████████████████████████████████████▏                                                                                                                                | 225/656 [10:40:31<20:23:21, 170.30s/it]                                                                                                                                                                                                                                                  {'loss': '0.7721', 'grad_norm': '0.6607', 'learning_rate': '0.0001', 'ppl': '2.164', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.97', 'tokens/total': 297216000, 'tokens/trainable': 36250024, 'epoch': '0.6866'}
 34%|███████████████████████████████████████████████████████████████████▏                                                                                                                                | 225/656 [10:40:31<20:23:21, 170.30s/it] 34%|███████████████████████████████████████████████████████████████████▌                                                                                                                                | 226/656 [10:43:23<20:25:09, 170.95s/it]                                                                                                                                                                                                                                                  {'loss': '0.8412', 'grad_norm': '0.7368', 'learning_rate': '0.0001', 'ppl': '2.319', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '64.74', 'tokens/total': 298536960, 'tokens/trainable': 36407152, 'epoch': '0.6897'}
 34%|███████████████████████████████████████████████████████████████████▌                                                                                                                                | 226/656 [10:43:23<20:25:09, 170.95s/it] 35%|███████████████████████████████████████████████████████████████████▊                                                                                                                                | 227/656 [10:46:14<20:21:20, 170.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.9096', 'grad_norm': '0.7319', 'learning_rate': '0.0001', 'ppl': '2.483', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.58', 'tokens/total': 299857920, 'tokens/trainable': 36556944, 'epoch': '0.6927'}
 35%|███████████████████████████████████████████████████████████████████▊                                                                                                                                | 227/656 [10:46:14<20:21:20, 170.82s/it] 35%|████████████████████████████████████████████████████████████████████                                                                                                                                | 228/656 [10:49:05<20:18:30, 170.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.8427', 'grad_norm': '0.7391', 'learning_rate': '0.0001', 'ppl': '2.323', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '18.29', 'tokens/total': 301178880, 'tokens/trainable': 36695876, 'epoch': '0.6958'}
 35%|████████████████████████████████████████████████████████████████████                                                                                                                                | 228/656 [10:49:05<20:18:30, 170.82s/it] 35%|████████████████████████████████████████████████████████████████████▍                                                                                                                               | 229/656 [10:51:55<20:15:22, 170.78s/it]                                                                                                                                                                                                                                                  {'loss': '0.8799', 'grad_norm': '0.6971', 'learning_rate': '0.0001', 'ppl': '2.411', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.59', 'tokens/total': 302499840, 'tokens/trainable': 36857812, 'epoch': '0.6988'}
 35%|████████████████████████████████████████████████████████████████████▍                                                                                                                               | 229/656 [10:51:55<20:15:22, 170.78s/it] 35%|████████████████████████████████████████████████████████████████████▋                                                                                                                               | 230/656 [10:54:46<20:11:47, 170.68s/it]                                                                                                                                                                                                                                                  {'loss': '0.8511', 'grad_norm': '0.7153', 'learning_rate': '0.0001', 'ppl': '2.342', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '70.16', 'tokens/total': 303820800, 'tokens/trainable': 37014428, 'epoch': '0.7019'}
 35%|████████████████████████████████████████████████████████████████████▋                                                                                                                               | 230/656 [10:54:46<20:11:47, 170.68s/it] 35%|█████████████████████████████████████████████████████████████████████                                                                                                                               | 231/656 [10:57:38<20:12:07, 171.12s/it]                                                                                                                                                                                                                                                  {'loss': '0.8289', 'grad_norm': '0.6637', 'learning_rate': '0.0001', 'ppl': '2.291', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.03', 'tokens/total': 305141760, 'tokens/trainable': 37191948, 'epoch': '0.7049'}
 35%|█████████████████████████████████████████████████████████████████████                                                                                                                               | 231/656 [10:57:38<20:12:07, 171.12s/it] 35%|█████████████████████████████████████████████████████████████████████▎                                                                                                                              | 232/656 [11:00:26<20:03:41, 170.33s/it]                                                                                                                                                                                                                                                  {'loss': '0.8185', 'grad_norm': '0.6903', 'learning_rate': '0.0001', 'ppl': '2.267', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.09', 'tokens/total': 306462720, 'tokens/trainable': 37351264, 'epoch': '0.708'}
 35%|█████████████████████████████████████████████████████████████████████▎                                                                                                                              | 232/656 [11:00:26<20:03:41, 170.33s/it] 36%|█████████████████████████████████████████████████████████████████████▌                                                                                                                              | 233/656 [11:03:15<19:58:03, 169.94s/it]                                                                                                                                                                                                                                                  {'loss': '0.804', 'grad_norm': '0.6933', 'learning_rate': '0.0001', 'ppl': '2.234', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.23', 'tokens/total': 307783680, 'tokens/trainable': 37512276, 'epoch': '0.711'}
 36%|█████████████████████████████████████████████████████████████████████▌                                                                                                                              | 233/656 [11:03:16<19:58:03, 169.94s/it] 36%|█████████████████████████████████████████████████████████████████████▉                                                                                                                              | 234/656 [11:06:08<20:00:51, 170.74s/it]                                                                                                                                                                                                                                                  {'loss': '0.8204', 'grad_norm': '0.7101', 'learning_rate': '0.0001', 'ppl': '2.271', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.17', 'tokens/total': 309104640, 'tokens/trainable': 37669556, 'epoch': '0.7141'}
 36%|█████████████████████████████████████████████████████████████████████▉                                                                                                                              | 234/656 [11:06:08<20:00:51, 170.74s/it] 36%|██████████████████████████████████████████████████████████████████████▏                                                                                                                             | 235/656 [11:09:00<20:00:31, 171.10s/it]                                                                                                                                                                                                                                                  {'loss': '0.8002', 'grad_norm': '0.6361', 'learning_rate': '0.0001', 'ppl': '2.226', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.79', 'tokens/total': 310425600, 'tokens/trainable': 37859880, 'epoch': '0.7171'}
 36%|██████████████████████████████████████████████████████████████████████▏                                                                                                                             | 235/656 [11:09:00<20:00:31, 171.10s/it] 36%|██████████████████████████████████████████████████████████████████████▌                                                                                                                             | 236/656 [11:11:52<19:59:34, 171.37s/it]                                                                                                                                                                                                                                                  {'loss': '0.8442', 'grad_norm': '0.6839', 'learning_rate': '0.0001', 'ppl': '2.326', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.58', 'tokens/total': 311746560, 'tokens/trainable': 38032388, 'epoch': '0.7202'}
 36%|██████████████████████████████████████████████████████████████████████▌                                                                                                                             | 236/656 [11:11:52<19:59:34, 171.37s/it] 36%|██████████████████████████████████████████████████████████████████████▊                                                                                                                             | 237/656 [11:14:41<19:51:41, 170.65s/it]                                                                                                                                                                                                                                                  {'loss': '0.855', 'grad_norm': '0.7647', 'learning_rate': '0.0001', 'ppl': '2.351', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.15', 'tokens/total': 313067520, 'tokens/trainable': 38181844, 'epoch': '0.7232'}
 36%|██████████████████████████████████████████████████████████████████████▊                                                                                                                             | 237/656 [11:14:41<19:51:41, 170.65s/it] 36%|███████████████████████████████████████████████████████████████████████                                                                                                                             | 238/656 [11:17:30<19:46:20, 170.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.8049', 'grad_norm': '0.7102', 'learning_rate': '0.0001', 'ppl': '2.237', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.82', 'tokens/total': 314388480, 'tokens/trainable': 38344036, 'epoch': '0.7263'}
 36%|███████████████████████████████████████████████████████████████████████                                                                                                                             | 238/656 [11:17:30<19:46:20, 170.29s/it] 36%|███████████████████████████████████████████████████████████████████████▍                                                                                                                            | 239/656 [11:20:22<19:45:28, 170.57s/it]                                                                                                                                                                                                                                                  {'loss': '0.8475', 'grad_norm': '0.7085', 'learning_rate': '0.0001', 'ppl': '2.334', 'memory/max_active (GiB)': '54.93', 'memory/max_allocated (GiB)': '54.93', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.91', 'tokens/total': 315709440, 'tokens/trainable': 38488464, 'epoch': '0.7293'}
 36%|███████████████████████████████████████████████████████████████████████▍                                                                                                                            | 239/656 [11:20:22<19:45:28, 170.57s/it] 37%|███████████████████████████████████████████████████████████████████████▋                                                                                                                            | 240/656 [11:23:15<19:48:19, 171.39s/it]                                                                                                                                                                                                                                                  {'loss': '0.8271', 'grad_norm': '0.6932', 'learning_rate': '0.0001', 'ppl': '2.287', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.95', 'tokens/total': 317030400, 'tokens/trainable': 38647968, 'epoch': '0.7324'}
 37%|███████████████████████████████████████████████████████████████████████▋                                                                                                                            | 240/656 [11:23:15<19:48:19, 171.39s/it] 37%|████████████████████████████████████████████████████████████████████████                                                                                                                            | 241/656 [11:26:05<19:42:58, 171.03s/it]                                                                                                                                                                                                                                                  {'loss': '0.8588', 'grad_norm': '0.7047', 'learning_rate': '0.0001', 'ppl': '2.36', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.21', 'tokens/total': 318351360, 'tokens/trainable': 38809944, 'epoch': '0.7354'}
 37%|████████████████████████████████████████████████████████████████████████                                                                                                                            | 241/656 [11:26:05<19:42:58, 171.03s/it] 37%|████████████████████████████████████████████████████████████████████████▎                                                                                                                           | 242/656 [11:28:56<19:39:58, 171.01s/it]                                                                                                                                                                                                                                                  {'loss': '0.8562', 'grad_norm': '0.7081', 'learning_rate': '0.0001', 'ppl': '2.354', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.93', 'tokens/total': 319672320, 'tokens/trainable': 38964340, 'epoch': '0.7385'}
 37%|████████████████████████████████████████████████████████████████████████▎                                                                                                                           | 242/656 [11:28:56<19:39:58, 171.01s/it] 37%|████████████████████████████████████████████████████████████████████████▌                                                                                                                           | 243/656 [11:31:47<19:37:31, 171.07s/it]                                                                                                                                                                                                                                                  {'loss': '0.8204', 'grad_norm': '0.7476', 'learning_rate': '0.0001', 'ppl': '2.271', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32', 'tokens/total': 320993280, 'tokens/trainable': 39104968, 'epoch': '0.7415'}
 37%|████████████████████████████████████████████████████████████████████████▌                                                                                                                           | 243/656 [11:31:47<19:37:31, 171.07s/it] 37%|████████████████████████████████████████████████████████████████████████▉                                                                                                                           | 244/656 [11:34:37<19:32:12, 170.71s/it]                                                                                                                                                                                                                                                  {'loss': '0.8317', 'grad_norm': '0.7147', 'learning_rate': '0.0001', 'ppl': '2.297', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.64', 'tokens/total': 322314240, 'tokens/trainable': 39255800, 'epoch': '0.7446'}
 37%|████████████████████████████████████████████████████████████████████████▉                                                                                                                           | 244/656 [11:34:37<19:32:12, 170.71s/it] 37%|█████████████████████████████████████████████████████████████████████████▏                                                                                                                          | 245/656 [11:37:29<19:32:35, 171.18s/it]                                                                                                                                                                                                                                                  {'loss': '0.8149', 'grad_norm': '0.6978', 'learning_rate': '0.0001', 'ppl': '2.259', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.78', 'tokens/total': 323635200, 'tokens/trainable': 39422928, 'epoch': '0.7476'}
 37%|█████████████████████████████████████████████████████████████████████████▏                                                                                                                          | 245/656 [11:37:29<19:32:35, 171.18s/it] 38%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                          | 246/656 [11:40:23<19:34:34, 171.89s/it]                                                                                                                                                                                                                                                  {'loss': '0.8', 'grad_norm': '0.6435', 'learning_rate': '0.0001', 'ppl': '2.226', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '92.37', 'tokens/total': 324956160, 'tokens/trainable': 39610652, 'epoch': '0.7507'}
 38%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                          | 246/656 [11:40:23<19:34:34, 171.89s/it] 38%|█████████████████████████████████████████████████████████████████████████▊                                                                                                                          | 247/656 [11:43:12<19:26:50, 171.17s/it]                                                                                                                                                                                                                                                  {'loss': '0.8209', 'grad_norm': '0.7226', 'learning_rate': '0.0001', 'ppl': '2.273', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '26.72', 'tokens/total': 326277120, 'tokens/trainable': 39770868, 'epoch': '0.7537'}
 38%|█████████████████████████████████████████████████████████████████████████▊                                                                                                                          | 247/656 [11:43:12<19:26:50, 171.17s/it] 38%|██████████████████████████████████████████████████████████████████████████                                                                                                                          | 248/656 [11:46:04<19:25:09, 171.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.8119', 'grad_norm': '0.7162', 'learning_rate': '0.0001', 'ppl': '2.252', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '35.35', 'tokens/total': 327598080, 'tokens/trainable': 39928984, 'epoch': '0.7568'}
 38%|██████████████████████████████████████████████████████████████████████████                                                                                                                          | 248/656 [11:46:04<19:25:09, 171.35s/it] 38%|██████████████████████████████████████████████████████████████████████████▍                                                                                                                         | 249/656 [11:48:55<19:20:25, 171.07s/it]                                                                                                                                                                                                                                                  {'loss': '0.8504', 'grad_norm': '0.6988', 'learning_rate': '0.0001', 'ppl': '2.341', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.73', 'tokens/total': 328919040, 'tokens/trainable': 40095336, 'epoch': '0.7598'}
 38%|██████████████████████████████████████████████████████████████████████████▍                                                                                                                         | 249/656 [11:48:55<19:20:25, 171.07s/it] 38%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                         | 250/656 [11:51:47<19:19:46, 171.40s/it]                                                                                                                                                                                                                                                  {'loss': '0.794', 'grad_norm': '0.6783', 'learning_rate': '0.0001', 'ppl': '2.212', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.4', 'tokens/total': 330240000, 'tokens/trainable': 40269760, 'epoch': '0.7629'}
 38%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                         | 250/656 [11:51:47<19:19:46, 171.40s/it] 38%|██████████████████████████████████████████████████████████████████████████▉                                                                                                                         | 251/656 [11:54:36<19:13:14, 170.85s/it]                                                                                                                                                                                                                                                  {'loss': '0.8561', 'grad_norm': '0.6987', 'learning_rate': '0.0001', 'ppl': '2.354', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.44', 'tokens/total': 331560960, 'tokens/trainable': 40427648, 'epoch': '0.7659'}
 38%|██████████████████████████████████████████████████████████████████████████▉                                                                                                                         | 251/656 [11:54:36<19:13:14, 170.85s/it] 38%|███████████████████████████████████████████████████████████████████████████▎                                                                                                                        | 252/656 [11:57:30<19:15:23, 171.59s/it]                                                                                                                                                                                                                                                  {'loss': '0.7864', 'grad_norm': '0.6854', 'learning_rate': '0.0001', 'ppl': '2.195', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.62', 'tokens/total': 332881920, 'tokens/trainable': 40584128, 'epoch': '0.769'}
 38%|███████████████████████████████████████████████████████████████████████████▎                                                                                                                        | 252/656 [11:57:30<19:15:23, 171.59s/it] 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                        | 253/656 [12:00:21<19:12:53, 171.65s/it]                                                                                                                                                                                                                                                  {'loss': '0.7673', 'grad_norm': '0.6596', 'learning_rate': '0.0001', 'ppl': '2.154', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '66.38', 'tokens/total': 334202880, 'tokens/trainable': 40755576, 'epoch': '0.772'}
 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                        | 253/656 [12:00:21<19:12:53, 171.65s/it] 39%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                        | 254/656 [12:03:14<19:12:09, 171.96s/it]                                                                                                                                                                                                                                                  {'loss': '0.7863', 'grad_norm': '0.731', 'learning_rate': '0.0001', 'ppl': '2.195', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.03', 'tokens/total': 335523840, 'tokens/trainable': 40931168, 'epoch': '0.7751'}
 39%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                        | 254/656 [12:03:14<19:12:09, 171.96s/it] 39%|████████████████████████████████████████████████████████████████████████████▏                                                                                                                       | 255/656 [12:06:05<19:07:31, 171.70s/it]                                                                                                                                                                                                                                                  {'loss': '0.8349', 'grad_norm': '0.6664', 'learning_rate': '0.0001', 'ppl': '2.305', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.03', 'tokens/total': 336844800, 'tokens/trainable': 41115536, 'epoch': '0.7782'}
 39%|████████████████████████████████████████████████████████████████████████████▏                                                                                                                       | 255/656 [12:06:05<19:07:31, 171.70s/it] 39%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                       | 256/656 [12:08:55<19:01:52, 171.28s/it]                                                                                                                                                                                                                                                  {'loss': '0.842', 'grad_norm': '0.6611', 'learning_rate': '0.0001', 'ppl': '2.321', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '91.08', 'tokens/total': 338165760, 'tokens/trainable': 41302344, 'epoch': '0.7812'}
 39%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                       | 256/656 [12:08:56<19:01:52, 171.28s/it] 39%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                       | 257/656 [12:11:48<19:01:13, 171.61s/it]                                                                                                                                                                                                                                                  {'loss': '0.8349', 'grad_norm': '0.6539', 'learning_rate': '0.0001', 'ppl': '2.305', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.04', 'tokens/total': 339486720, 'tokens/trainable': 41482640, 'epoch': '0.7843'}
 39%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                       | 257/656 [12:11:48<19:01:13, 171.61s/it] 39%|█████████████████████████████████████████████████████████████████████████████                                                                                                                       | 258/656 [12:14:38<18:54:49, 171.08s/it]                                                                                                                                                                                                                                                  {'loss': '0.8725', 'grad_norm': '0.7297', 'learning_rate': '0.0001', 'ppl': '2.393', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.42', 'tokens/total': 340807680, 'tokens/trainable': 41631160, 'epoch': '0.7873'}
 39%|█████████████████████████████████████████████████████████████████████████████                                                                                                                       | 258/656 [12:14:38<18:54:49, 171.08s/it] 39%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                      | 259/656 [12:17:30<18:54:04, 171.40s/it]                                                                                                                                                                                                                                                  {'loss': '0.8773', 'grad_norm': '0.7058', 'learning_rate': '0.0001', 'ppl': '2.405', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.11', 'tokens/total': 342128640, 'tokens/trainable': 41791900, 'epoch': '0.7904'}
 39%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                      | 259/656 [12:17:30<18:54:04, 171.40s/it] 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 260/656 [12:20:21<18:50:56, 171.36s/it]                                                                                                                                                                                                                                                  {'loss': '0.8225', 'grad_norm': '0.6984', 'learning_rate': '0.0001', 'ppl': '2.276', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '30.26', 'tokens/total': 343449600, 'tokens/trainable': 41956100, 'epoch': '0.7934'}
 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 260/656 [12:20:21<18:50:56, 171.36s/it] 40%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                      | 261/656 [12:23:11<18:44:57, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.9216', 'grad_norm': '0.6955', 'learning_rate': '0.0001', 'ppl': '2.513', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.62', 'tokens/total': 344770560, 'tokens/trainable': 42117104, 'epoch': '0.7965'}
 40%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                      | 261/656 [12:23:11<18:44:57, 170.88s/it] 40%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                     | 262/656 [12:26:01<18:40:29, 170.63s/it]                                                                                                                                                                                                                                                  {'loss': '0.8161', 'grad_norm': '0.7194', 'learning_rate': '0.0001', 'ppl': '2.262', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.95', 'tokens/total': 346091520, 'tokens/trainable': 42266236, 'epoch': '0.7995'}
 40%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                     | 262/656 [12:26:01<18:40:29, 170.63s/it] 40%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                     | 263/656 [12:28:51<18:37:14, 170.57s/it]                                                                                                                                                                                                                                                  {'loss': '0.8325', 'grad_norm': '0.665', 'learning_rate': '0.0001', 'ppl': '2.299', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.23', 'tokens/total': 347412480, 'tokens/trainable': 42435364, 'epoch': '0.8026'}
 40%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                     | 263/656 [12:28:51<18:37:14, 170.57s/it] 40%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 264/656 [12:31:41<18:32:10, 170.23s/it]                                                                                                                                                                                                                                                  {'loss': '0.831', 'grad_norm': '0.8083', 'learning_rate': '0.0001', 'ppl': '2.296', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.19', 'tokens/total': 348733440, 'tokens/trainable': 42586380, 'epoch': '0.8056'}
 40%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 264/656 [12:31:41<18:32:10, 170.23s/it] 40%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 265/656 [12:34:32<18:30:55, 170.48s/it]                                                                                                                                                                                                                                                  {'loss': '0.8673', 'grad_norm': '0.6906', 'learning_rate': '0.0001', 'ppl': '2.38', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.08', 'tokens/total': 350054400, 'tokens/trainable': 42750116, 'epoch': '0.8087'}
 40%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                    | 265/656 [12:34:32<18:30:55, 170.48s/it] 41%|███████████████████████████████████████████████████████████████████████████████▍                                                                                                                    | 266/656 [12:37:23<18:29:27, 170.68s/it]                                                                                                                                                                                                                                                  {'loss': '0.8176', 'grad_norm': '0.6794', 'learning_rate': '0.0001', 'ppl': '2.265', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '49.75', 'tokens/total': 351375360, 'tokens/trainable': 42910088, 'epoch': '0.8117'}
 41%|███████████████████████████████████████████████████████████████████████████████▍                                                                                                                    | 266/656 [12:37:23<18:29:27, 170.68s/it] 41%|███████████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 267/656 [12:40:14<18:26:24, 170.65s/it]                                                                                                                                                                                                                                                  {'loss': '0.7903', 'grad_norm': '0.6635', 'learning_rate': '0.0001', 'ppl': '2.204', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.69', 'tokens/total': 352696320, 'tokens/trainable': 43077124, 'epoch': '0.8148'}
 41%|███████████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 267/656 [12:40:14<18:26:24, 170.65s/it] 41%|████████████████████████████████████████████████████████████████████████████████                                                                                                                    | 268/656 [12:43:05<18:24:53, 170.86s/it]                                                                                                                                                                                                                                                  {'loss': '0.7891', 'grad_norm': '0.6807', 'learning_rate': '0.0001', 'ppl': '2.201', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '74.41', 'tokens/total': 354017280, 'tokens/trainable': 43239352, 'epoch': '0.8178'}
 41%|████████████████████████████████████████████████████████████████████████████████                                                                                                                    | 268/656 [12:43:05<18:24:53, 170.86s/it] 41%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 269/656 [12:45:55<18:21:07, 170.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.7905', 'grad_norm': '0.6735', 'learning_rate': '0.0001', 'ppl': '2.205', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.07', 'tokens/total': 355338240, 'tokens/trainable': 43396944, 'epoch': '0.8209'}
 41%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 269/656 [12:45:55<18:21:07, 170.72s/it] 41%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 270/656 [12:48:45<18:16:54, 170.50s/it]                                                                                                                                                                                                                                                  {'loss': '0.786', 'grad_norm': '0.6386', 'learning_rate': '0.0001', 'ppl': '2.195', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.92', 'tokens/total': 356659200, 'tokens/trainable': 43581068, 'epoch': '0.8239'}
 41%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 270/656 [12:48:45<18:16:54, 170.50s/it] 41%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                                   | 271/656 [12:51:36<18:14:36, 170.59s/it]                                                                                                                                                                                                                                                  {'loss': '0.8229', 'grad_norm': '0.6927', 'learning_rate': '0.0001', 'ppl': '2.277', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.82', 'tokens/total': 357980160, 'tokens/trainable': 43752608, 'epoch': '0.827'}
 41%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                                   | 271/656 [12:51:36<18:14:36, 170.59s/it] 41%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 272/656 [12:54:30<18:17:07, 171.43s/it]                                                                                                                                                                                                                                                  {'loss': '0.8896', 'grad_norm': '0.669', 'learning_rate': '0.0001', 'ppl': '2.434', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '69.31', 'tokens/total': 359301120, 'tokens/trainable': 43930440, 'epoch': '0.83'}
 41%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                                  | 272/656 [12:54:30<18:17:07, 171.43s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                                                  | 273/656 [12:57:19<18:11:06, 170.93s/it]                                                                                                                                                                                                                                                  {'loss': '0.8658', 'grad_norm': '0.7415', 'learning_rate': '0.0001', 'ppl': '2.377', 'memory/max_active (GiB)': '54.75', 'memory/max_allocated (GiB)': '54.75', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.12', 'tokens/total': 360622080, 'tokens/trainable': 44070004, 'epoch': '0.8331'}
 42%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                                                  | 273/656 [12:57:19<18:11:06, 170.93s/it] 42%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                                  | 274/656 [13:00:11<18:10:17, 171.25s/it]                                                                                                                                                                                                                                                  {'loss': '0.8597', 'grad_norm': '0.7469', 'learning_rate': '0.0001', 'ppl': '2.362', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.22', 'tokens/total': 361943040, 'tokens/trainable': 44222248, 'epoch': '0.8361'}
 42%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                                  | 274/656 [13:00:11<18:10:17, 171.25s/it] 42%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                                 | 275/656 [13:03:01<18:04:24, 170.77s/it]                                                                                                                                                                                                                                                  {'loss': '0.7711', 'grad_norm': '0.644', 'learning_rate': '0.0001', 'ppl': '2.162', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '30.64', 'tokens/total': 363264000, 'tokens/trainable': 44395624, 'epoch': '0.8392'}
 42%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                                 | 275/656 [13:03:01<18:04:24, 170.77s/it] 42%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 276/656 [13:05:53<18:03:29, 171.08s/it]                                                                                                                                                                                                                                                  {'loss': '0.8155', 'grad_norm': '0.696', 'learning_rate': '0.0001', 'ppl': '2.26', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.98', 'tokens/total': 364584960, 'tokens/trainable': 44556444, 'epoch': '0.8422'}
 42%|██████████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 276/656 [13:05:53<18:03:29, 171.08s/it] 42%|██████████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 277/656 [13:08:43<17:58:11, 170.69s/it]                                                                                                                                                                                                                                                  {'loss': '0.7838', 'grad_norm': '0.6685', 'learning_rate': '0.0001', 'ppl': '2.19', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.49', 'tokens/total': 365905920, 'tokens/trainable': 44724780, 'epoch': '0.8453'}
 42%|██████████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 277/656 [13:08:43<17:58:11, 170.69s/it] 42%|███████████████████████████████████████████████████████████████████████████████████                                                                                                                 | 278/656 [13:11:34<17:56:15, 170.84s/it]                                                                                                                                                                                                                                                  {'loss': '0.8741', 'grad_norm': '0.7223', 'learning_rate': '0.0001', 'ppl': '2.397', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.47', 'tokens/total': 367226880, 'tokens/trainable': 44880512, 'epoch': '0.8483'}
 42%|███████████████████████████████████████████████████████████████████████████████████                                                                                                                 | 278/656 [13:11:34<17:56:15, 170.84s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                                                | 279/656 [13:14:26<17:56:57, 171.40s/it]                                                                                                                                                                                                                                                  {'loss': '0.8454', 'grad_norm': '0.7031', 'learning_rate': '0.0001', 'ppl': '2.329', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.26', 'tokens/total': 368547840, 'tokens/trainable': 45036144, 'epoch': '0.8514'}
 43%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                                                | 279/656 [13:14:26<17:56:57, 171.40s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                                | 280/656 [13:17:17<17:52:43, 171.18s/it]                                                                                                                                                                                                                                                  {'loss': '0.8028', 'grad_norm': '0.7147', 'learning_rate': '0.0001', 'ppl': '2.232', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '24.82', 'tokens/total': 369868800, 'tokens/trainable': 45190580, 'epoch': '0.8544'}
 43%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                                | 280/656 [13:17:17<17:52:43, 171.18s/it] 43%|███████████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 281/656 [13:20:06<17:46:30, 170.64s/it]                                                                                                                                                                                                                                                  {'loss': '0.8682', 'grad_norm': '0.6979', 'learning_rate': '0.0001', 'ppl': '2.383', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '23.68', 'tokens/total': 371189760, 'tokens/trainable': 45338552, 'epoch': '0.8575'}
 43%|███████████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 281/656 [13:20:07<17:46:30, 170.64s/it] 43%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                               | 282/656 [13:22:56<17:42:14, 170.41s/it]                                                                                                                                                                                                                                                  {'loss': '0.7832', 'grad_norm': '0.6704', 'learning_rate': '0.0001', 'ppl': '2.188', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.33', 'tokens/total': 372510720, 'tokens/trainable': 45506752, 'epoch': '0.8605'}
 43%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                               | 282/656 [13:22:56<17:42:14, 170.41s/it] 43%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 283/656 [13:25:50<17:46:01, 171.48s/it]                                                                                                                                                                                                                                                  {'loss': '0.7831', 'grad_norm': '0.6361', 'learning_rate': '0.0001', 'ppl': '2.188', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.79', 'tokens/total': 373831680, 'tokens/trainable': 45694024, 'epoch': '0.8636'}
 43%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 283/656 [13:25:50<17:46:01, 171.48s/it] 43%|████████████████████████████████████████████████████████████████████████████████████▊                                                                                                               | 284/656 [13:28:39<17:37:42, 170.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.8009', 'grad_norm': '0.6994', 'learning_rate': '0.0001', 'ppl': '2.228', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.18', 'tokens/total': 375152640, 'tokens/trainable': 45845368, 'epoch': '0.8666'}
 43%|████████████████████████████████████████████████████████████████████████████████████▊                                                                                                               | 284/656 [13:28:39<17:37:42, 170.60s/it] 43%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 285/656 [13:31:30<17:35:00, 170.62s/it]                                                                                                                                                                                                                                                  {'loss': '0.8103', 'grad_norm': '0.7124', 'learning_rate': '0.0001', 'ppl': '2.248', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.88', 'tokens/total': 376473600, 'tokens/trainable': 45993212, 'epoch': '0.8697'}
 43%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 285/656 [13:31:30<17:35:00, 170.62s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                              | 286/656 [13:34:20<17:32:47, 170.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.8206', 'grad_norm': '0.6994', 'learning_rate': '0.0001', 'ppl': '2.272', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.36', 'tokens/total': 377794560, 'tokens/trainable': 46141148, 'epoch': '0.8727'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                              | 286/656 [13:34:20<17:32:47, 170.72s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 287/656 [13:37:13<17:33:35, 171.32s/it]                                                                                                                                                                                                                                                  {'loss': '0.8458', 'grad_norm': '0.7171', 'learning_rate': '0.0001', 'ppl': '2.33', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.1', 'tokens/total': 379115520, 'tokens/trainable': 46291556, 'epoch': '0.8758'}
 44%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 287/656 [13:37:13<17:33:35, 171.32s/it] 44%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                              | 288/656 [13:40:04<17:29:33, 171.12s/it]                                                                                                                                                                                                                                                  {'loss': '0.8668', 'grad_norm': '0.7037', 'learning_rate': '0.0001', 'ppl': '2.379', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.98', 'tokens/total': 380436480, 'tokens/trainable': 46443316, 'epoch': '0.8789'}
 44%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                              | 288/656 [13:40:04<17:29:33, 171.12s/it] 44%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                             | 289/656 [13:42:53<17:23:05, 170.53s/it]                                                                                                                                                                                                                                                  {'loss': '0.7953', 'grad_norm': '0.6668', 'learning_rate': '0.0001', 'ppl': '2.215', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.69', 'tokens/total': 381757440, 'tokens/trainable': 46612552, 'epoch': '0.8819'}
 44%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                             | 289/656 [13:42:53<17:23:05, 170.53s/it] 44%|██████████████████████████████████████████████████████████████████████████████████████▋                                                                                                             | 290/656 [13:45:44<17:20:28, 170.57s/it]                                                                                                                                                                                                                                                  {'loss': '0.7831', 'grad_norm': '0.6681', 'learning_rate': '0.0001', 'ppl': '2.188', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '67.59', 'tokens/total': 383078400, 'tokens/trainable': 46785704, 'epoch': '0.885'}
 44%|██████████████████████████████████████████████████████████████████████████████████████▋                                                                                                             | 290/656 [13:45:44<17:20:28, 170.57s/it] 44%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                                                             | 291/656 [13:48:33<17:15:25, 170.21s/it]                                                                                                                                                                                                                                                  {'loss': '0.786', 'grad_norm': '0.7048', 'learning_rate': '0.0001', 'ppl': '2.195', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42', 'tokens/total': 384399360, 'tokens/trainable': 46939428, 'epoch': '0.888'}
 44%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                                                             | 291/656 [13:48:33<17:15:25, 170.21s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                            | 292/656 [13:51:24<17:13:12, 170.31s/it]                                                                                                                                                                                                                                                  {'loss': '0.8385', 'grad_norm': '0.7221', 'learning_rate': '0.0001', 'ppl': '2.313', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '60.86', 'tokens/total': 385720320, 'tokens/trainable': 47084504, 'epoch': '0.8911'}
 45%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                            | 292/656 [13:51:24<17:13:12, 170.31s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                                            | 293/656 [13:54:13<17:09:31, 170.17s/it]                                                                                                                                                                                                                                                  {'loss': '0.8424', 'grad_norm': '0.6893', 'learning_rate': '0.0001', 'ppl': '2.322', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.33', 'tokens/total': 387041280, 'tokens/trainable': 47251600, 'epoch': '0.8941'}
 45%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                                            | 293/656 [13:54:13<17:09:31, 170.17s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 294/656 [13:57:06<17:10:43, 170.84s/it]                                                                                                                                                                                                                                                  {'loss': '0.7704', 'grad_norm': '0.633', 'learning_rate': '0.0001', 'ppl': '2.161', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '69.9', 'tokens/total': 388362240, 'tokens/trainable': 47425776, 'epoch': '0.8972'}
 45%|███████████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 294/656 [13:57:06<17:10:43, 170.84s/it] 45%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 295/656 [13:59:55<17:05:41, 170.47s/it]                                                                                                                                                                                                                                                  {'loss': '0.8211', 'grad_norm': '0.6555', 'learning_rate': '0.0001', 'ppl': '2.273', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.54', 'tokens/total': 389683200, 'tokens/trainable': 47603408, 'epoch': '0.9002'}
 45%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 295/656 [13:59:55<17:05:41, 170.47s/it] 45%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                           | 296/656 [14:02:44<16:59:46, 169.96s/it]                                                                                                                                                                                                                                                  {'loss': '0.8118', 'grad_norm': '0.7029', 'learning_rate': '0.0001', 'ppl': '2.252', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.38', 'tokens/total': 391004160, 'tokens/trainable': 47759852, 'epoch': '0.9033'}
 45%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                           | 296/656 [14:02:44<16:59:46, 169.96s/it] 45%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 297/656 [14:05:35<16:58:55, 170.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.8075', 'grad_norm': '0.6788', 'learning_rate': '0.0001', 'ppl': '2.242', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '63.19', 'tokens/total': 392325120, 'tokens/trainable': 47923892, 'epoch': '0.9063'}
 45%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 297/656 [14:05:35<16:58:55, 170.29s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 298/656 [14:08:26<16:56:23, 170.34s/it]                                                                                                                                                                                                                                                  {'loss': '0.7995', 'grad_norm': '0.7084', 'learning_rate': '0.0001', 'ppl': '2.225', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.55', 'tokens/total': 393646080, 'tokens/trainable': 48066744, 'epoch': '0.9094'}
 45%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                                           | 298/656 [14:08:26<16:56:23, 170.34s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 299/656 [14:11:17<16:55:57, 170.75s/it]                                                                                                                                                                                                                                                  {'loss': '0.8205', 'grad_norm': '0.6563', 'learning_rate': '0.0001', 'ppl': '2.272', 'memory/max_active (GiB)': '54.93', 'memory/max_allocated (GiB)': '54.93', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '21.91', 'tokens/total': 394967040, 'tokens/trainable': 48229792, 'epoch': '0.9124'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 299/656 [14:11:17<16:55:57, 170.75s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                          | 300/656 [14:14:08<16:52:52, 170.71s/it]                                                                                                                                                                                                                                                  {'loss': '0.8825', 'grad_norm': '0.7045', 'learning_rate': '0.0001', 'ppl': '2.417', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.14', 'tokens/total': 396288000, 'tokens/trainable': 48378748, 'epoch': '0.9155'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                          | 300/656 [14:14:08<16:52:52, 170.71s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 301/656 [14:17:01<16:54:04, 171.39s/it]                                                                                                                                                                                                                                                  {'loss': '0.799', 'grad_norm': '0.6433', 'learning_rate': '0.0001', 'ppl': '2.223', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.5', 'tokens/total': 397608960, 'tokens/trainable': 48563528, 'epoch': '0.9185'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 301/656 [14:17:01<16:54:04, 171.39s/it] 46%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 302/656 [14:19:52<16:49:55, 171.17s/it]                                                                                                                                                                                                                                                  {'loss': '0.7873', 'grad_norm': '0.8251', 'learning_rate': '0.0001', 'ppl': '2.197', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.9', 'tokens/total': 398929920, 'tokens/trainable': 48742464, 'epoch': '0.9216'}
 46%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 302/656 [14:19:52<16:49:55, 171.17s/it] 46%|██████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 303/656 [14:22:42<16:45:52, 170.97s/it]                                                                                                                                                                                                                                                  {'loss': '0.8068', 'grad_norm': '0.6821', 'learning_rate': '0.0001', 'ppl': '2.241', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.1', 'tokens/total': 400250880, 'tokens/trainable': 48906572, 'epoch': '0.9246'}
 46%|██████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 303/656 [14:22:42<16:45:52, 170.97s/it] 46%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 304/656 [14:25:33<16:42:12, 170.83s/it]                                                                                                                                                                                                                                                  {'loss': '0.7781', 'grad_norm': '0.6591', 'learning_rate': '0.0001', 'ppl': '2.177', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.04', 'tokens/total': 401571840, 'tokens/trainable': 49081552, 'epoch': '0.9277'}
 46%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 304/656 [14:25:33<16:42:12, 170.83s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                        | 305/656 [14:28:22<16:36:21, 170.32s/it]                                                                                                                                                                                                                                                  {'loss': '0.8215', 'grad_norm': '0.6862', 'learning_rate': '0.0001', 'ppl': '2.274', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.1', 'tokens/total': 402892800, 'tokens/trainable': 49243300, 'epoch': '0.9307'}
 46%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                        | 305/656 [14:28:22<16:36:21, 170.32s/it] 47%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                        | 306/656 [14:31:14<16:35:55, 170.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.8237', 'grad_norm': '0.8382', 'learning_rate': '0.0001', 'ppl': '2.279', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.46', 'tokens/total': 404213760, 'tokens/trainable': 49402312, 'epoch': '0.9338'}
 47%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                        | 306/656 [14:31:14<16:35:55, 170.73s/it] 47%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 307/656 [14:34:04<16:32:10, 170.58s/it]                                                                                                                                                                                                                                                  {'loss': '0.864', 'grad_norm': '0.6882', 'learning_rate': '0.0001', 'ppl': '2.373', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.95', 'tokens/total': 405534720, 'tokens/trainable': 49561392, 'epoch': '0.9368'}
 47%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 307/656 [14:34:04<16:32:10, 170.58s/it] 47%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 308/656 [14:36:54<16:29:34, 170.62s/it]                                                                                                                                                                                                                                                  {'loss': '0.8394', 'grad_norm': '0.6969', 'learning_rate': '0.0001', 'ppl': '2.315', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.62', 'tokens/total': 406855680, 'tokens/trainable': 49711616, 'epoch': '0.9399'}
 47%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 308/656 [14:36:54<16:29:34, 170.62s/it] 47%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 309/656 [14:39:45<16:27:05, 170.68s/it]                                                                                                                                                                                                                                                  {'loss': '0.8143', 'grad_norm': '0.6767', 'learning_rate': '0.0001', 'ppl': '2.257', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.83', 'tokens/total': 408176640, 'tokens/trainable': 49879336, 'epoch': '0.9429'}
 47%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 309/656 [14:39:45<16:27:05, 170.68s/it] 47%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 310/656 [14:42:37<16:26:15, 171.03s/it]                                                                                                                                                                                                                                                  {'loss': '0.82', 'grad_norm': '0.6901', 'learning_rate': '0.0001', 'ppl': '2.271', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.97', 'tokens/total': 409497600, 'tokens/trainable': 50040136, 'epoch': '0.946'}
 47%|████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 310/656 [14:42:37<16:26:15, 171.03s/it] 47%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                       | 311/656 [14:45:29<16:24:11, 171.16s/it]                                                                                                                                                                                                                                                  {'loss': '0.7552', 'grad_norm': '0.7141', 'learning_rate': '0.0001', 'ppl': '2.128', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '27.03', 'tokens/total': 410818560, 'tokens/trainable': 50213596, 'epoch': '0.949'}
 47%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                       | 311/656 [14:45:29<16:24:11, 171.16s/it] 48%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                      | 312/656 [14:48:19<16:20:04, 170.94s/it]                                                                                                                                                                                                                                                  {'loss': '0.815', 'grad_norm': '0.7084', 'learning_rate': '0.0001', 'ppl': '2.259', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.72', 'tokens/total': 412139520, 'tokens/trainable': 50357672, 'epoch': '0.9521'}
 48%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                      | 312/656 [14:48:19<16:20:04, 170.94s/it] 48%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 313/656 [14:51:10<16:17:03, 170.91s/it]                                                                                                                                                                                                                                                  {'loss': '0.7919', 'grad_norm': '0.6746', 'learning_rate': '0.0001', 'ppl': '2.208', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.1', 'tokens/total': 413460480, 'tokens/trainable': 50518992, 'epoch': '0.9551'}
 48%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 313/656 [14:51:10<16:17:03, 170.91s/it] 48%|█████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 314/656 [14:54:00<16:12:20, 170.59s/it]                                                                                                                                                                                                                                                  {'loss': '0.816', 'grad_norm': '0.717', 'learning_rate': '0.0001', 'ppl': '2.261', 'memory/max_active (GiB)': '54.94', 'memory/max_allocated (GiB)': '54.94', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '26.54', 'tokens/total': 414781440, 'tokens/trainable': 50669408, 'epoch': '0.9582'}
 48%|█████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 314/656 [14:54:00<16:12:20, 170.59s/it] 48%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 315/656 [14:56:51<16:10:29, 170.76s/it]                                                                                                                                                                                                                                                  {'loss': '0.8327', 'grad_norm': '3.855', 'learning_rate': '0.0001', 'ppl': '2.3', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '88.7', 'tokens/total': 416102400, 'tokens/trainable': 50827432, 'epoch': '0.9612'}
 48%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                      | 315/656 [14:56:51<16:10:29, 170.76s/it] 48%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 316/656 [14:59:42<16:08:00, 170.83s/it]                                                                                                                                                                                                                                                  {'loss': '0.7917', 'grad_norm': '0.7203', 'learning_rate': '0.0001', 'ppl': '2.207', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '113', 'tokens/total': 417423360, 'tokens/trainable': 50987704, 'epoch': '0.9643'}
 48%|██████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 316/656 [14:59:42<16:08:00, 170.83s/it] 48%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 317/656 [15:02:31<16:02:37, 170.38s/it]                                                                                                                                                                                                                                                  {'loss': '0.8371', 'grad_norm': '0.7035', 'learning_rate': '0.0001', 'ppl': '2.31', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.51', 'tokens/total': 418744320, 'tokens/trainable': 51140984, 'epoch': '0.9673'}
 48%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 317/656 [15:02:31<16:02:37, 170.38s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 318/656 [15:05:21<15:58:20, 170.12s/it]                                                                                                                                                                                                                                                  {'loss': '0.8269', 'grad_norm': '0.6886', 'learning_rate': '0.0001', 'ppl': '2.286', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.09', 'tokens/total': 420065280, 'tokens/trainable': 51303264, 'epoch': '0.9704'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 318/656 [15:05:21<15:58:20, 170.12s/it] 49%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                    | 319/656 [15:08:12<15:56:44, 170.34s/it]                                                                                                                                                                                                                                                  {'loss': '0.8212', 'grad_norm': '0.6908', 'learning_rate': '0.0001', 'ppl': '2.273', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '67.58', 'tokens/total': 421386240, 'tokens/trainable': 51459720, 'epoch': '0.9735'}
 49%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                    | 319/656 [15:08:12<15:56:44, 170.34s/it] 49%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 320/656 [15:11:00<15:50:17, 169.70s/it]                                                                                                                                                                                                                                                  {'loss': '0.8577', 'grad_norm': '0.7407', 'learning_rate': '0.0001', 'ppl': '2.358', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.03', 'tokens/total': 422707200, 'tokens/trainable': 51598752, 'epoch': '0.9765'}
 49%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 320/656 [15:11:00<15:50:17, 169.70s/it] 49%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 321/656 [15:13:51<15:49:35, 170.08s/it]                                                                                                                                                                                                                                                  {'loss': '0.8305', 'grad_norm': '0.7038', 'learning_rate': '0.0001', 'ppl': '2.295', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '57.94', 'tokens/total': 424028160, 'tokens/trainable': 51750832, 'epoch': '0.9796'}
 49%|███████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 321/656 [15:13:51<15:49:35, 170.08s/it] 49%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                   | 322/656 [15:16:44<15:52:34, 171.12s/it]                                                                                                                                                                                                                                                  {'loss': '0.8402', 'grad_norm': '0.6798', 'learning_rate': '0.0001', 'ppl': '2.317', 'memory/max_active (GiB)': '55', 'memory/max_allocated (GiB)': '55', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.18', 'tokens/total': 425349120, 'tokens/trainable': 51916920, 'epoch': '0.9826'}
 49%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                   | 322/656 [15:16:44<15:52:34, 171.12s/it] 49%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 323/656 [15:19:35<15:49:50, 171.14s/it]                                                                                                                                                                                                                                                  {'loss': '0.7921', 'grad_norm': '0.655', 'learning_rate': '0.0001', 'ppl': '2.208', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.71', 'tokens/total': 426670080, 'tokens/trainable': 52089052, 'epoch': '0.9857'}
 49%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 323/656 [15:19:35<15:49:50, 171.14s/it] 49%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 324/656 [15:22:24<15:42:23, 170.31s/it]                                                                                                                                                                                                                                                  {'loss': '0.8231', 'grad_norm': '0.7278', 'learning_rate': '0.0001', 'ppl': '2.278', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '29.02', 'tokens/total': 427991040, 'tokens/trainable': 52226832, 'epoch': '0.9887'}
 49%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 324/656 [15:22:24<15:42:23, 170.31s/it] 50%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                   | 325/656 [15:25:15<15:41:03, 170.58s/it]                                                                                                                                                                                                                                                  {'loss': '0.8237', 'grad_norm': '0.6884', 'learning_rate': '0.0001', 'ppl': '2.279', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.1', 'tokens/total': 429312000, 'tokens/trainable': 52381028, 'epoch': '0.9918'}
 50%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                   | 325/656 [15:25:15<15:41:03, 170.58s/it] 50%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 326/656 [15:28:06<15:38:58, 170.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.806', 'grad_norm': '0.7015', 'learning_rate': '0.0001', 'ppl': '2.239', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.15', 'tokens/total': 430632960, 'tokens/trainable': 52539040, 'epoch': '0.9948'}
 50%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 326/656 [15:28:06<15:38:58, 170.72s/it] 50%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 327/656 [15:30:57<15:36:39, 170.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.8054', 'grad_norm': '0.6736', 'learning_rate': '0.0001', 'ppl': '2.238', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '80.41', 'tokens/total': 431953920, 'tokens/trainable': 52706908, 'epoch': '0.9979'}
 50%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 327/656 [15:30:57<15:36:39, 170.82s/it] 50%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 328/656 [15:32:53<14:04:31, 154.49s/it]                                                                                                                                                                                                                                                  {'loss': '0.8396', 'grad_norm': '0.81', 'learning_rate': '0.0001', 'ppl': '2.315', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '86.19', 'tokens/total': 432856576, 'tokens/trainable': 52816800, 'epoch': '1'}
 50%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 328/656 [15:32:53<14:04:31, 154.49s/it][2026-02-12 19:13:01,682] [INFO] [axolotl.core.trainers.base._save:721] [PID:9815] Saving model checkpoint to ./finetune-model-output/checkpoint-328
 50%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                 | 329/656 [15:36:26<15:37:36, 172.04s/it]                                                                                                                                                                                                                                                  {'loss': '0.7012', 'grad_norm': '0.7361', 'learning_rate': '0.0001', 'ppl': '2.016', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.39', 'tokens/total': 434177536, 'tokens/trainable': 52964232, 'epoch': '1.003'}
 50%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                 | 329/656 [15:36:27<15:37:36, 172.04s/it] 50%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 330/656 [15:39:16<15:30:41, 171.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.6887', 'grad_norm': '0.7036', 'learning_rate': '0.0001', 'ppl': '1.991', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.41', 'tokens/total': 435498496, 'tokens/trainable': 53117272, 'epoch': '1.006'}
 50%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 330/656 [15:39:16<15:30:41, 171.29s/it] 50%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 331/656 [15:42:05<15:24:10, 170.62s/it]                                                                                                                                                                                                                                                  {'loss': '0.6775', 'grad_norm': '0.7834', 'learning_rate': '0.0001', 'ppl': '1.969', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.48', 'tokens/total': 436819456, 'tokens/trainable': 53284852, 'epoch': '1.009'}
 50%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 331/656 [15:42:05<15:24:10, 170.62s/it] 51%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                | 332/656 [15:44:54<15:18:51, 170.16s/it]                                                                                                                                                                                                                                                  {'loss': '0.7191', 'grad_norm': '0.7824', 'learning_rate': '0.0001', 'ppl': '2.053', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '35.34', 'tokens/total': 438140416, 'tokens/trainable': 53433976, 'epoch': '1.012'}
 51%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                | 332/656 [15:44:54<15:18:51, 170.16s/it] 51%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 333/656 [15:47:48<15:21:36, 171.20s/it]                                                                                                                                                                                                                                                  {'loss': '0.675', 'grad_norm': '0.6992', 'learning_rate': '0.0001', 'ppl': '1.964', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '91.55', 'tokens/total': 439461376, 'tokens/trainable': 53613688, 'epoch': '1.015'}
 51%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 333/656 [15:47:48<15:21:36, 171.20s/it] 51%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 334/656 [15:50:36<15:14:26, 170.39s/it]                                                                                                                                                                                                                                                  {'loss': '0.7546', 'grad_norm': '0.7708', 'learning_rate': '0.0001', 'ppl': '2.127', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.96', 'tokens/total': 440782336, 'tokens/trainable': 53759236, 'epoch': '1.018'}
 51%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 334/656 [15:50:36<15:14:26, 170.39s/it] 51%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 335/656 [15:53:28<15:14:26, 170.92s/it]                                                                                                                                                                                                                                                  {'loss': '0.7321', 'grad_norm': '0.7202', 'learning_rate': '0.0001', 'ppl': '2.079', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.4', 'tokens/total': 442103296, 'tokens/trainable': 53928840, 'epoch': '1.021'}
 51%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 335/656 [15:53:28<15:14:26, 170.92s/it] 51%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                               | 336/656 [15:56:19<15:10:35, 170.74s/it]                                                                                                                                                                                                                                                  {'loss': '0.6462', 'grad_norm': '0.6774', 'learning_rate': '0.0001', 'ppl': '1.908', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '87.05', 'tokens/total': 443424256, 'tokens/trainable': 54112048, 'epoch': '1.024'}
 51%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                               | 336/656 [15:56:19<15:10:35, 170.74s/it] 51%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 337/656 [15:59:10<15:08:42, 170.92s/it]                                                                                                                                                                                                                                                  {'loss': '0.6946', 'grad_norm': '0.7534', 'learning_rate': '0.0001', 'ppl': '2.003', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.36', 'tokens/total': 444745216, 'tokens/trainable': 54275312, 'epoch': '1.027'}
 51%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 337/656 [15:59:10<15:08:42, 170.92s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 338/656 [16:02:02<15:06:55, 171.12s/it]                                                                                                                                                                                                                                                  {'loss': '0.701', 'grad_norm': '0.7645', 'learning_rate': '0.0001', 'ppl': '2.016', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.03', 'tokens/total': 446066176, 'tokens/trainable': 54438904, 'epoch': '1.031'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 338/656 [16:02:02<15:06:55, 171.12s/it] 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 339/656 [16:04:51<15:00:28, 170.44s/it]                                                                                                                                                                                                                                                  {'loss': '0.7514', 'grad_norm': '0.7828', 'learning_rate': '0.0001', 'ppl': '2.12', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '57.17', 'tokens/total': 447387136, 'tokens/trainable': 54588128, 'epoch': '1.034'}
 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 339/656 [16:04:51<15:00:28, 170.44s/it] 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 340/656 [16:07:40<14:56:06, 170.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.6718', 'grad_norm': '0.7467', 'learning_rate': '0.0001', 'ppl': '1.958', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.49', 'tokens/total': 448708096, 'tokens/trainable': 54743632, 'epoch': '1.037'}
 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 340/656 [16:07:40<14:56:06, 170.15s/it] 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 341/656 [16:10:31<14:54:28, 170.38s/it]                                                                                                                                                                                                                                                  {'loss': '0.6707', 'grad_norm': '0.7358', 'learning_rate': '0.0001', 'ppl': '1.956', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '57.32', 'tokens/total': 450029056, 'tokens/trainable': 54895004, 'epoch': '1.04'}
 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 341/656 [16:10:31<14:54:28, 170.38s/it] 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 342/656 [16:13:20<14:50:16, 170.11s/it]                                                                                                                                                                                                                                                  {'loss': '0.6587', 'grad_norm': '0.7234', 'learning_rate': '0.0001', 'ppl': '1.932', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.16', 'tokens/total': 451350016, 'tokens/trainable': 55061548, 'epoch': '1.043'}
 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 342/656 [16:13:20<14:50:16, 170.11s/it] 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                             | 343/656 [16:16:13<14:51:08, 170.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.7114', 'grad_norm': '0.7252', 'learning_rate': '0.0001', 'ppl': '2.037', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '49.78', 'tokens/total': 452670976, 'tokens/trainable': 55230752, 'epoch': '1.046'}
 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                             | 343/656 [16:16:13<14:51:08, 170.82s/it] 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 344/656 [16:19:02<14:45:58, 170.38s/it]                                                                                                                                                                                                                                                  {'loss': '0.6737', 'grad_norm': '0.8472', 'learning_rate': '0.0001', 'ppl': '1.961', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.52', 'tokens/total': 453991936, 'tokens/trainable': 55380500, 'epoch': '1.049'}
 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 344/656 [16:19:02<14:45:58, 170.38s/it] 53%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 345/656 [16:21:54<14:45:19, 170.80s/it]                                                                                                                                                                                                                                                  {'loss': '0.6818', 'grad_norm': '0.7289', 'learning_rate': '0.0001', 'ppl': '1.977', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.31', 'tokens/total': 455312896, 'tokens/trainable': 55542672, 'epoch': '1.052'}
 53%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 345/656 [16:21:54<14:45:19, 170.80s/it] 53%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 346/656 [16:24:45<14:42:59, 170.90s/it]                                                                                                                                                                                                                                                  {'loss': '0.665', 'grad_norm': '0.7343', 'learning_rate': '0.0001', 'ppl': '1.944', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.07', 'tokens/total': 456633856, 'tokens/trainable': 55701056, 'epoch': '1.055'}
 53%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 346/656 [16:24:45<14:42:59, 170.90s/it] 53%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                            | 347/656 [16:27:35<14:38:15, 170.53s/it]                                                                                                                                                                                                                                                  {'loss': '0.7164', 'grad_norm': '0.7501', 'learning_rate': '0.0001', 'ppl': '2.047', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.55', 'tokens/total': 457954816, 'tokens/trainable': 55852616, 'epoch': '1.058'}
 53%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                            | 347/656 [16:27:35<14:38:15, 170.53s/it] 53%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 348/656 [16:30:27<14:38:37, 171.16s/it]                                                                                                                                                                                                                                                  {'loss': '0.7012', 'grad_norm': '0.7971', 'learning_rate': '0.0001', 'ppl': '2.016', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.32', 'tokens/total': 459275776, 'tokens/trainable': 56006252, 'epoch': '1.061'}
 53%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 348/656 [16:30:27<14:38:37, 171.16s/it] 53%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 349/656 [16:33:18<14:35:12, 171.05s/it]                                                                                                                                                                                                                                                  {'loss': '0.6595', 'grad_norm': '0.6953', 'learning_rate': '0.0001', 'ppl': '1.934', 'memory/max_active (GiB)': '54.93', 'memory/max_allocated (GiB)': '54.93', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '23.37', 'tokens/total': 460596736, 'tokens/trainable': 56180848, 'epoch': '1.064'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 349/656 [16:33:18<14:35:12, 171.05s/it] 53%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 350/656 [16:36:11<14:34:42, 171.51s/it]                                                                                                                                                                                                                                                  {'loss': '0.6491', 'grad_norm': '0.6885', 'learning_rate': '0.0001', 'ppl': '1.914', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '93.91', 'tokens/total': 461917696, 'tokens/trainable': 56372704, 'epoch': '1.067'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 350/656 [16:36:11<14:34:42, 171.51s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 351/656 [16:39:02<14:31:02, 171.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.7041', 'grad_norm': '0.7926', 'learning_rate': '0.0001', 'ppl': '2.022', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.69', 'tokens/total': 463238656, 'tokens/trainable': 56533076, 'epoch': '1.07'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 351/656 [16:39:02<14:31:02, 171.35s/it] 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 352/656 [16:41:52<14:26:36, 171.04s/it]                                                                                                                                                                                                                                                  {'loss': '0.6749', 'grad_norm': '0.7445', 'learning_rate': '0.0001', 'ppl': '1.964', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '35.17', 'tokens/total': 464559616, 'tokens/trainable': 56682516, 'epoch': '1.073'}
 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 352/656 [16:41:52<14:26:36, 171.04s/it] 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                          | 353/656 [16:44:44<14:24:18, 171.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.6957', 'grad_norm': '0.7337', 'learning_rate': '0.0001', 'ppl': '2.005', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '26.19', 'tokens/total': 465880576, 'tokens/trainable': 56845120, 'epoch': '1.076'}
 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                          | 353/656 [16:44:44<14:24:18, 171.15s/it] 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                          | 354/656 [16:47:34<14:20:20, 170.93s/it]                                                                                                                                                                                                                                                  {'loss': '0.6978', 'grad_norm': '0.7648', 'learning_rate': '0.0001', 'ppl': '2.009', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.7', 'tokens/total': 467201536, 'tokens/trainable': 57011772, 'epoch': '1.079'}
 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                          | 354/656 [16:47:34<14:20:20, 170.93s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 355/656 [16:50:24<14:16:18, 170.69s/it]                                                                                                                                                                                                                                                  {'loss': '0.6818', 'grad_norm': '0.7146', 'learning_rate': '0.0001', 'ppl': '1.978', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '61.85', 'tokens/total': 468522496, 'tokens/trainable': 57184664, 'epoch': '1.082'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                          | 355/656 [16:50:24<14:16:18, 170.69s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                         | 356/656 [16:53:15<14:14:13, 170.85s/it]                                                                                                                                                                                                                                                  {'loss': '0.6574', 'grad_norm': '0.7003', 'learning_rate': '0.0001', 'ppl': '1.93', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '61.79', 'tokens/total': 469843456, 'tokens/trainable': 57374928, 'epoch': '1.085'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                         | 356/656 [16:53:15<14:14:13, 170.85s/it] 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 357/656 [16:56:06<14:11:16, 170.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.6555', 'grad_norm': '0.7175', 'learning_rate': '0.0001', 'ppl': '1.926', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.75', 'tokens/total': 471164416, 'tokens/trainable': 57549116, 'epoch': '1.088'}
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 357/656 [16:56:06<14:11:16, 170.82s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                         | 358/656 [16:58:55<14:05:08, 170.16s/it]                                                                                                                                                                                                                                                  {'loss': '0.7906', 'grad_norm': '0.8009', 'learning_rate': '0.0001', 'ppl': '2.205', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.81', 'tokens/total': 472485376, 'tokens/trainable': 57691496, 'epoch': '1.092'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                         | 358/656 [16:58:55<14:05:08, 170.16s/it] 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 359/656 [17:01:45<14:02:07, 170.13s/it]                                                                                                                                                                                                                                                  {'loss': '0.6886', 'grad_norm': '0.7381', 'learning_rate': '0.0001', 'ppl': '1.991', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.9', 'tokens/total': 473806336, 'tokens/trainable': 57851600, 'epoch': '1.095'}
 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 359/656 [17:01:45<14:02:07, 170.13s/it] 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 360/656 [17:04:37<14:03:01, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.6972', 'grad_norm': '0.7281', 'learning_rate': '0.0001', 'ppl': '2.008', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.06', 'tokens/total': 475127296, 'tokens/trainable': 58017552, 'epoch': '1.098'}
 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 360/656 [17:04:37<14:03:01, 170.88s/it] 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 361/656 [17:07:28<14:00:23, 170.93s/it]                                                                                                                                                                                                                                                  {'loss': '0.6632', 'grad_norm': '0.6894', 'learning_rate': '0.0001', 'ppl': '1.941', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '100.3', 'tokens/total': 476448256, 'tokens/trainable': 58194660, 'epoch': '1.101'}
 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 361/656 [17:07:28<14:00:23, 170.93s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 362/656 [17:10:17<13:54:09, 170.24s/it]                                                                                                                                                                                                                                                  {'loss': '0.7277', 'grad_norm': '0.7729', 'learning_rate': '0.0001', 'ppl': '2.07', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.62', 'tokens/total': 477769216, 'tokens/trainable': 58344216, 'epoch': '1.104'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 362/656 [17:10:17<13:54:09, 170.24s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 363/656 [17:13:08<13:52:59, 170.58s/it]                                                                                                                                                                                                                                                  {'loss': '0.7259', 'grad_norm': '0.7659', 'learning_rate': '0.0001', 'ppl': '2.067', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.35', 'tokens/total': 479090176, 'tokens/trainable': 58497528, 'epoch': '1.107'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 363/656 [17:13:08<13:52:59, 170.58s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 364/656 [17:15:59<13:50:51, 170.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.7077', 'grad_norm': '0.7662', 'learning_rate': '0.0001', 'ppl': '2.029', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.27', 'tokens/total': 480411136, 'tokens/trainable': 58649624, 'epoch': '1.11'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 364/656 [17:15:59<13:50:51, 170.72s/it] 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                       | 365/656 [17:18:51<13:48:51, 170.90s/it]                                                                                                                                                                                                                                                  {'loss': '0.6894', 'grad_norm': '0.7132', 'learning_rate': '0.0001', 'ppl': '1.993', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '64.18', 'tokens/total': 481732096, 'tokens/trainable': 58815464, 'epoch': '1.113'}
 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                       | 365/656 [17:18:51<13:48:51, 170.90s/it] 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 366/656 [17:21:44<13:48:44, 171.46s/it]                                                                                                                                                                                                                                                  {'loss': '0.7725', 'grad_norm': '0.7483', 'learning_rate': '0.0001', 'ppl': '2.165', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.87', 'tokens/total': 483053056, 'tokens/trainable': 58975136, 'epoch': '1.116'}
 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 366/656 [17:21:44<13:48:44, 171.46s/it] 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 367/656 [17:24:37<13:48:05, 171.92s/it]                                                                                                                                                                                                                                                  {'loss': '0.6987', 'grad_norm': '0.6716', 'learning_rate': '0.0001', 'ppl': '2.011', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.31', 'tokens/total': 484374016, 'tokens/trainable': 59162576, 'epoch': '1.119'}
 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 367/656 [17:24:37<13:48:05, 171.92s/it] 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 368/656 [17:27:27<13:43:05, 171.48s/it]                                                                                                                                                                                                                                                  {'loss': '0.6942', 'grad_norm': '0.7223', 'learning_rate': '0.0001', 'ppl': '2.002', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.72', 'tokens/total': 485694976, 'tokens/trainable': 59325528, 'epoch': '1.122'}
 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 368/656 [17:27:27<13:43:05, 171.48s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                     | 369/656 [17:30:20<13:41:45, 171.79s/it]                                                                                                                                                                                                                                                  {'loss': '0.6716', 'grad_norm': '0.7551', 'learning_rate': '0.0001', 'ppl': '1.957', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.71', 'tokens/total': 487015936, 'tokens/trainable': 59496600, 'epoch': '1.125'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                     | 369/656 [17:30:20<13:41:45, 171.79s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 370/656 [17:33:11<13:37:52, 171.58s/it]                                                                                                                                                                                                                                                  {'loss': '0.7235', 'grad_norm': '0.8083', 'learning_rate': '0.0001', 'ppl': '2.062', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '30.32', 'tokens/total': 488336896, 'tokens/trainable': 59639112, 'epoch': '1.128'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 370/656 [17:33:11<13:37:52, 171.58s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                     | 371/656 [17:36:01<13:32:40, 171.09s/it]                                                                                                                                                                                                                                                  {'loss': '0.6842', 'grad_norm': '0.7419', 'learning_rate': '0.0001', 'ppl': '1.982', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.78', 'tokens/total': 489657856, 'tokens/trainable': 59793584, 'epoch': '1.131'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                     | 371/656 [17:36:01<13:32:40, 171.09s/it] 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                    | 372/656 [17:38:52<13:29:52, 171.10s/it]                                                                                                                                                                                                                                                  {'loss': '0.7329', 'grad_norm': '0.7356', 'learning_rate': '0.0001', 'ppl': '2.081', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.16', 'tokens/total': 490978816, 'tokens/trainable': 59973224, 'epoch': '1.134'}
 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                    | 372/656 [17:38:52<13:29:52, 171.10s/it] 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 373/656 [17:41:44<13:29:21, 171.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.6985', 'grad_norm': '0.7622', 'learning_rate': '0.0001', 'ppl': '2.011', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.68', 'tokens/total': 492299776, 'tokens/trainable': 60142688, 'epoch': '1.137'}
 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 373/656 [17:41:44<13:29:21, 171.60s/it] 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 374/656 [17:44:35<13:24:25, 171.16s/it]                                                                                                                                                                                                                                                  {'loss': '0.6846', 'grad_norm': '0.7233', 'learning_rate': '0.0001', 'ppl': '1.983', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '35.09', 'tokens/total': 493620736, 'tokens/trainable': 60303088, 'epoch': '1.14'}
 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 374/656 [17:44:35<13:24:25, 171.16s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 375/656 [17:47:27<13:22:47, 171.41s/it]                                                                                                                                                                                                                                                  {'loss': '0.7205', 'grad_norm': '0.7373', 'learning_rate': '0.0001', 'ppl': '2.055', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.95', 'tokens/total': 494941696, 'tokens/trainable': 60464376, 'epoch': '1.143'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 375/656 [17:47:27<13:22:47, 171.41s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 376/656 [17:50:17<13:18:52, 171.19s/it]                                                                                                                                                                                                                                                  {'loss': '0.6675', 'grad_norm': '0.7427', 'learning_rate': '0.0001', 'ppl': '1.949', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.23', 'tokens/total': 496262656, 'tokens/trainable': 60623152, 'epoch': '1.146'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 376/656 [17:50:17<13:18:52, 171.19s/it] 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 377/656 [17:53:09<13:16:44, 171.34s/it]                                                                                                                                                                                                                                                  {'loss': '0.7411', 'grad_norm': '0.7414', 'learning_rate': '0.0001', 'ppl': '2.098', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.72', 'tokens/total': 497583616, 'tokens/trainable': 60789584, 'epoch': '1.15'}
 57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 377/656 [17:53:09<13:16:44, 171.34s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 378/656 [17:55:59<13:12:35, 171.06s/it]                                                                                                                                                                                                                                                  {'loss': '0.7198', 'grad_norm': '0.7386', 'learning_rate': '0.0001', 'ppl': '2.054', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.26', 'tokens/total': 498904576, 'tokens/trainable': 60947072, 'epoch': '1.153'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 378/656 [17:55:59<13:12:35, 171.06s/it] 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                  | 379/656 [17:58:52<13:11:35, 171.46s/it]                                                                                                                                                                                                                                                  {'loss': '0.6913', 'grad_norm': '0.7574', 'learning_rate': '0.0001', 'ppl': '1.996', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.81', 'tokens/total': 500225536, 'tokens/trainable': 61107784, 'epoch': '1.156'}
 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                  | 379/656 [17:58:52<13:11:35, 171.46s/it] 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 380/656 [18:01:42<13:07:37, 171.22s/it]                                                                                                                                                                                                                                                  {'loss': '0.7464', 'grad_norm': '0.7892', 'learning_rate': '0.0001', 'ppl': '2.109', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 501546496, 'tokens/trainable': 61250872, 'epoch': '1.159'}
 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 380/656 [18:01:42<13:07:37, 171.22s/it] 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 381/656 [18:04:32<13:03:04, 170.85s/it]                                                                                                                                                                                                                                                  {'loss': '0.6945', 'grad_norm': '0.7665', 'learning_rate': '0.0001', 'ppl': '2.003', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.81', 'tokens/total': 502867456, 'tokens/trainable': 61400160, 'epoch': '1.162'}
 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 381/656 [18:04:32<13:03:04, 170.85s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 382/656 [18:07:22<12:58:31, 170.48s/it]                                                                                                                                                                                                                                                  {'loss': '0.6978', 'grad_norm': '0.7234', 'learning_rate': '0.0001', 'ppl': '2.009', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.07', 'tokens/total': 504188416, 'tokens/trainable': 61567188, 'epoch': '1.165'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 382/656 [18:07:22<12:58:31, 170.48s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                 | 383/656 [18:10:14<12:57:38, 170.91s/it]                                                                                                                                                                                                                                                  {'loss': '0.7152', 'grad_norm': '0.7863', 'learning_rate': '0.0001', 'ppl': '2.045', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.41', 'tokens/total': 505509376, 'tokens/trainable': 61720032, 'epoch': '1.168'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                 | 383/656 [18:10:14<12:57:38, 170.91s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 384/656 [18:13:08<12:59:12, 171.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.7093', 'grad_norm': '0.7327', 'learning_rate': '0.0001', 'ppl': '2.033', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.03', 'tokens/total': 506830336, 'tokens/trainable': 61888440, 'epoch': '1.171'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 384/656 [18:13:08<12:59:12, 171.88s/it] 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 385/656 [18:15:59<12:55:03, 171.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.7083', 'grad_norm': '0.7365', 'learning_rate': '0.0001', 'ppl': '2.03', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '66.51', 'tokens/total': 508151296, 'tokens/trainable': 62043692, 'epoch': '1.174'}
 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 385/656 [18:15:59<12:55:03, 171.60s/it] 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                | 386/656 [18:18:49<12:50:05, 171.13s/it]                                                                                                                                                                                                                                                  {'loss': '0.6935', 'grad_norm': '0.7227', 'learning_rate': '0.0001', 'ppl': '2.001', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.56', 'tokens/total': 509472256, 'tokens/trainable': 62212960, 'epoch': '1.177'}
 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                | 386/656 [18:18:49<12:50:05, 171.13s/it] 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                | 387/656 [18:21:40<12:47:18, 171.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.675', 'grad_norm': '0.7094', 'learning_rate': '0.0001', 'ppl': '1.964', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 510793216, 'tokens/trainable': 62384672, 'epoch': '1.18'}
 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                | 387/656 [18:21:40<12:47:18, 171.15s/it] 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 388/656 [18:24:29<12:40:45, 170.32s/it]                                                                                                                                                                                                                                                  {'loss': '0.6797', 'grad_norm': '0.8045', 'learning_rate': '0.0001', 'ppl': '1.973', 'memory/max_active (GiB)': '54.76', 'memory/max_allocated (GiB)': '54.76', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.78', 'tokens/total': 512114176, 'tokens/trainable': 62550480, 'epoch': '1.183'}
 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 388/656 [18:24:29<12:40:45, 170.32s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 389/656 [18:27:19<12:38:36, 170.47s/it]                                                                                                                                                                                                                                                  {'loss': '0.6997', 'grad_norm': '0.7798', 'learning_rate': '0.0001', 'ppl': '2.013', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.26', 'tokens/total': 513435136, 'tokens/trainable': 62700168, 'epoch': '1.186'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 389/656 [18:27:19<12:38:36, 170.47s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 390/656 [18:30:10<12:36:24, 170.62s/it]                                                                                                                                                                                                                                                  {'loss': '0.7736', 'grad_norm': '3.707', 'learning_rate': '0.0001', 'ppl': '2.168', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '25.47', 'tokens/total': 514756096, 'tokens/trainable': 62861760, 'epoch': '1.189'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 390/656 [18:30:10<12:36:24, 170.62s/it] 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 391/656 [18:33:03<12:36:19, 171.25s/it]                                                                                                                                                                                                                                                  {'loss': '0.6856', 'grad_norm': '0.732', 'learning_rate': '0.0001', 'ppl': '1.985', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '63.27', 'tokens/total': 516077056, 'tokens/trainable': 63040732, 'epoch': '1.192'}
 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 391/656 [18:33:03<12:36:19, 171.25s/it] 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 392/656 [18:35:53<12:32:16, 170.97s/it]                                                                                                                                                                                                                                                  {'loss': '0.7231', 'grad_norm': '0.7873', 'learning_rate': '0.0001', 'ppl': '2.061', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '89.3', 'tokens/total': 517398016, 'tokens/trainable': 63195600, 'epoch': '1.195'}
 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 392/656 [18:35:53<12:32:16, 170.97s/it] 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 393/656 [18:38:46<12:31:05, 171.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.7347', 'grad_norm': '0.7694', 'learning_rate': '0.0001', 'ppl': '2.085', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.92', 'tokens/total': 518718976, 'tokens/trainable': 63359784, 'epoch': '1.198'}
 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 393/656 [18:38:46<12:31:05, 171.35s/it] 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 394/656 [18:41:36<12:27:24, 171.16s/it]                                                                                                                                                                                                                                                  {'loss': '0.706', 'grad_norm': '0.7176', 'learning_rate': '0.0001', 'ppl': '2.026', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.63', 'tokens/total': 520039936, 'tokens/trainable': 63528752, 'epoch': '1.201'}
 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 394/656 [18:41:36<12:27:24, 171.16s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 395/656 [18:44:28<12:25:35, 171.40s/it]                                                                                                                                                                                                                                                  {'loss': '0.7033', 'grad_norm': '0.7141', 'learning_rate': '0.0001', 'ppl': '2.02', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.27', 'tokens/total': 521360896, 'tokens/trainable': 63701508, 'epoch': '1.204'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 395/656 [18:44:28<12:25:35, 171.40s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 396/656 [18:47:20<12:22:56, 171.45s/it]                                                                                                                                                                                                                                                  {'loss': '0.7096', 'grad_norm': '0.7512', 'learning_rate': '0.0001', 'ppl': '2.033', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '57.82', 'tokens/total': 522681856, 'tokens/trainable': 63856284, 'epoch': '1.208'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 396/656 [18:47:20<12:22:56, 171.45s/it] 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 397/656 [18:50:09<12:17:34, 170.87s/it]                                                                                                                                                                                                                                                  {'loss': '0.6726', 'grad_norm': '0.7241', 'learning_rate': '0.0001', 'ppl': '1.959', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.85', 'tokens/total': 524002816, 'tokens/trainable': 64023096, 'epoch': '1.211'}
 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 397/656 [18:50:09<12:17:34, 170.87s/it] 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 398/656 [18:53:02<12:16:36, 171.30s/it]                                                                                                                                                                                                                                                  {'loss': '0.7297', 'grad_norm': '0.7418', 'learning_rate': '0.0001', 'ppl': '2.074', 'memory/max_active (GiB)': '54.95', 'memory/max_allocated (GiB)': '54.95', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.26', 'tokens/total': 525323776, 'tokens/trainable': 64188896, 'epoch': '1.214'}
 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 398/656 [18:53:02<12:16:36, 171.30s/it] 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 399/656 [18:55:51<12:11:23, 170.75s/it]                                                                                                                                                                                                                                                  {'loss': '0.7011', 'grad_norm': '0.7533', 'learning_rate': '0.0001', 'ppl': '2.016', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.11', 'tokens/total': 526644736, 'tokens/trainable': 64344360, 'epoch': '1.217'}
 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 399/656 [18:55:51<12:11:23, 170.75s/it] 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 400/656 [18:58:40<12:06:22, 170.25s/it]                                                                                                                                                                                                                                                  {'loss': '0.7316', 'grad_norm': '0.7739', 'learning_rate': '0.0001', 'ppl': '2.079', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.95', 'tokens/total': 527965696, 'tokens/trainable': 64493976, 'epoch': '1.22'}
 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 400/656 [18:58:40<12:06:22, 170.25s/it] 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 401/656 [19:01:31<12:03:31, 170.24s/it]                                                                                                                                                                                                                                                  {'loss': '0.7008', 'grad_norm': '0.7529', 'learning_rate': '0.0001', 'ppl': '2.015', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.22', 'tokens/total': 529286656, 'tokens/trainable': 64654760, 'epoch': '1.223'}
 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 401/656 [19:01:31<12:03:31, 170.24s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 402/656 [19:04:21<12:01:28, 170.43s/it]                                                                                                                                                                                                                                                  {'loss': '0.7634', 'grad_norm': '0.777', 'learning_rate': '0.0001', 'ppl': '2.145', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '63.69', 'tokens/total': 530607616, 'tokens/trainable': 64817500, 'epoch': '1.226'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 402/656 [19:04:21<12:01:28, 170.43s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 403/656 [19:07:12<11:58:32, 170.40s/it]                                                                                                                                                                                                                                                  {'loss': '0.7743', 'grad_norm': '0.731', 'learning_rate': '0.0001', 'ppl': '2.169', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.59', 'tokens/total': 531928576, 'tokens/trainable': 64988760, 'epoch': '1.229'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 403/656 [19:07:12<11:58:32, 170.40s/it] 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 404/656 [19:10:03<11:57:12, 170.77s/it]                                                                                                                                                                                                                                                  {'loss': '0.6854', 'grad_norm': '0.7146', 'learning_rate': '0.0001', 'ppl': '1.984', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.89', 'tokens/total': 533249536, 'tokens/trainable': 65164528, 'epoch': '1.232'}
 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 404/656 [19:10:03<11:57:12, 170.77s/it] 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                           | 405/656 [19:12:53<11:53:35, 170.58s/it]                                                                                                                                                                                                                                                  {'loss': '0.7322', 'grad_norm': '0.7443', 'learning_rate': '0.0001', 'ppl': '2.08', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.06', 'tokens/total': 534570496, 'tokens/trainable': 65323932, 'epoch': '1.235'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                           | 405/656 [19:12:54<11:53:35, 170.58s/it] 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                          | 406/656 [19:15:43<11:49:09, 170.20s/it]                                                                                                                                                                                                                                                  {'loss': '0.6944', 'grad_norm': '0.7066', 'learning_rate': '0.0001', 'ppl': '2.002', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '70.79', 'tokens/total': 535891456, 'tokens/trainable': 65491608, 'epoch': '1.238'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                          | 406/656 [19:15:43<11:49:09, 170.20s/it] 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 407/656 [19:18:33<11:45:50, 170.08s/it]                                                                                                                                                                                                                                                  {'loss': '0.7509', 'grad_norm': '0.7443', 'learning_rate': '0.0001', 'ppl': '2.119', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.6', 'tokens/total': 537212416, 'tokens/trainable': 65653884, 'epoch': '1.241'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 407/656 [19:18:33<11:45:50, 170.08s/it] 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 408/656 [19:21:24<11:44:53, 170.54s/it]                                                                                                                                                                                                                                                  {'loss': '0.6962', 'grad_norm': '0.7314', 'learning_rate': '0.0001', 'ppl': '2.006', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.37', 'tokens/total': 538533376, 'tokens/trainable': 65810584, 'epoch': '1.244'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 408/656 [19:21:24<11:44:53, 170.54s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                         | 409/656 [19:24:15<11:42:24, 170.62s/it]                                                                                                                                                                                                                                                  {'loss': '0.6938', 'grad_norm': '0.7347', 'learning_rate': '0.0001', 'ppl': '2.001', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.85', 'tokens/total': 539854336, 'tokens/trainable': 65970768, 'epoch': '1.247'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                         | 409/656 [19:24:15<11:42:24, 170.62s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 410/656 [19:27:05<11:38:16, 170.31s/it]                                                                                                                                                                                                                                                  {'loss': '0.6811', 'grad_norm': '0.7468', 'learning_rate': '0.0001', 'ppl': '1.976', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.38', 'tokens/total': 541175296, 'tokens/trainable': 66135092, 'epoch': '1.25'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 410/656 [19:27:05<11:38:16, 170.31s/it] 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 411/656 [19:29:58<11:39:35, 171.33s/it]                                                                                                                                                                                                                                                  {'loss': '0.7159', 'grad_norm': '0.7334', 'learning_rate': '0.0001', 'ppl': '2.046', 'memory/max_active (GiB)': '54.94', 'memory/max_allocated (GiB)': '54.94', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.53', 'tokens/total': 542496256, 'tokens/trainable': 66301720, 'epoch': '1.253'}
 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 411/656 [19:29:58<11:39:35, 171.33s/it] 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 412/656 [19:32:48<11:35:16, 170.97s/it]                                                                                                                                                                                                                                                  {'loss': '0.731', 'grad_norm': '0.7418', 'learning_rate': '0.0001', 'ppl': '2.077', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.45', 'tokens/total': 543817216, 'tokens/trainable': 66467392, 'epoch': '1.256'}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 412/656 [19:32:48<11:35:16, 170.97s/it] 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 413/656 [19:35:39<11:31:43, 170.80s/it]                                                                                                                                                                                                                                                  {'loss': '0.6821', 'grad_norm': '0.7091', 'learning_rate': '0.0001', 'ppl': '1.978', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.95', 'tokens/total': 545138176, 'tokens/trainable': 66636796, 'epoch': '1.259'}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 413/656 [19:35:39<11:31:43, 170.80s/it] 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 414/656 [19:38:29<11:28:21, 170.67s/it]                                                                                                                                                                                                                                                  {'loss': '0.7224', 'grad_norm': '0.7323', 'learning_rate': '0.0001', 'ppl': '2.059', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.01', 'tokens/total': 546459136, 'tokens/trainable': 66799688, 'epoch': '1.262'}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 414/656 [19:38:29<11:28:21, 170.67s/it] 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                        | 415/656 [19:41:20<11:25:41, 170.71s/it]                                                                                                                                                                                                                                                  {'loss': '0.6994', 'grad_norm': '0.76', 'learning_rate': '0.0001', 'ppl': '2.013', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.25', 'tokens/total': 547780096, 'tokens/trainable': 66953860, 'epoch': '1.265'}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                        | 415/656 [19:41:20<11:25:41, 170.71s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                       | 416/656 [19:44:11<11:23:42, 170.93s/it]                                                                                                                                                                                                                                                  {'loss': '0.7463', 'grad_norm': '0.7579', 'learning_rate': '0.0001', 'ppl': '2.109', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.97', 'tokens/total': 549101056, 'tokens/trainable': 67116576, 'epoch': '1.269'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                       | 416/656 [19:44:11<11:23:42, 170.93s/it] 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                       | 417/656 [19:47:03<11:21:48, 171.17s/it]                                                                                                                                                                                                                                                  {'loss': '0.7345', 'grad_norm': '0.7591', 'learning_rate': '0.0001', 'ppl': '2.084', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.33', 'tokens/total': 550422016, 'tokens/trainable': 67276160, 'epoch': '1.272'}
 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                       | 417/656 [19:47:03<11:21:48, 171.17s/it] 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 418/656 [19:49:54<11:18:57, 171.16s/it]                                                                                                                                                                                                                                                  {'loss': '0.7277', 'grad_norm': '0.7071', 'learning_rate': '0.0001', 'ppl': '2.07', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '65.93', 'tokens/total': 551742976, 'tokens/trainable': 67451952, 'epoch': '1.275'}
 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 418/656 [19:49:54<11:18:57, 171.16s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 419/656 [19:52:48<11:18:40, 171.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.7094', 'grad_norm': '0.7111', 'learning_rate': '0.0001', 'ppl': '2.033', 'memory/max_active (GiB)': '54.95', 'memory/max_allocated (GiB)': '54.95', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.49', 'tokens/total': 553063936, 'tokens/trainable': 67626512, 'epoch': '1.278'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 419/656 [19:52:48<11:18:40, 171.82s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 420/656 [19:55:38<11:13:34, 171.25s/it]                                                                                                                                                                                                                                                  {'loss': '0.7359', 'grad_norm': '0.7572', 'learning_rate': '0.0001', 'ppl': '2.087', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '21.12', 'tokens/total': 554384896, 'tokens/trainable': 67780080, 'epoch': '1.281'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 420/656 [19:55:38<11:13:34, 171.25s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 421/656 [19:58:29<11:11:14, 171.38s/it]                                                                                                                                                                                                                                                  {'loss': '0.6986', 'grad_norm': '0.7581', 'learning_rate': '0.0001', 'ppl': '2.011', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.94', 'tokens/total': 555705856, 'tokens/trainable': 67944064, 'epoch': '1.284'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 421/656 [19:58:30<11:11:14, 171.38s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                      | 422/656 [20:01:24<11:12:08, 172.34s/it]                                                                                                                                                                                                                                                  {'loss': '0.6837', 'grad_norm': '0.7057', 'learning_rate': '0.0001', 'ppl': '1.981', 'memory/max_active (GiB)': '54.96', 'memory/max_allocated (GiB)': '54.96', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.24', 'tokens/total': 557026816, 'tokens/trainable': 68114736, 'epoch': '1.287'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                      | 422/656 [20:01:24<11:12:08, 172.34s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 423/656 [20:04:14<11:06:11, 171.55s/it]                                                                                                                                                                                                                                                  {'loss': '0.7354', 'grad_norm': '0.7426', 'learning_rate': '0.0001', 'ppl': '2.086', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.17', 'tokens/total': 558347776, 'tokens/trainable': 68268640, 'epoch': '1.29'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 423/656 [20:04:14<11:06:11, 171.55s/it] 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 424/656 [20:07:05<11:03:29, 171.59s/it]                                                                                                                                                                                                                                                  {'loss': '0.7152', 'grad_norm': '0.6809', 'learning_rate': '0.0001', 'ppl': '2.045', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.14', 'tokens/total': 559668736, 'tokens/trainable': 68454448, 'epoch': '1.293'}
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 424/656 [20:07:05<11:03:29, 171.59s/it] 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 425/656 [20:09:58<11:01:22, 171.78s/it]                                                                                                                                                                                                                                                  {'loss': '0.7898', 'grad_norm': '0.7506', 'learning_rate': '0.0001', 'ppl': '2.203', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.96', 'tokens/total': 560989696, 'tokens/trainable': 68609536, 'epoch': '1.296'}
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 425/656 [20:09:58<11:01:22, 171.78s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                    | 426/656 [20:12:50<10:58:59, 171.91s/it]                                                                                                                                                                                                                                                  {'loss': '0.7704', 'grad_norm': '0.7703', 'learning_rate': '0.0001', 'ppl': '2.161', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.35', 'tokens/total': 562310656, 'tokens/trainable': 68759584, 'epoch': '1.299'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                    | 426/656 [20:12:50<10:58:59, 171.91s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 427/656 [20:15:41<10:54:57, 171.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.6999', 'grad_norm': '0.7054', 'learning_rate': '0.0001', 'ppl': '2.013', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.27', 'tokens/total': 563631616, 'tokens/trainable': 68933088, 'epoch': '1.302'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 427/656 [20:15:41<10:54:57, 171.60s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 428/656 [20:18:30<10:49:56, 171.04s/it]                                                                                                                                                                                                                                                  {'loss': '0.7313', 'grad_norm': '0.7353', 'learning_rate': '0.0001', 'ppl': '2.078', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.22', 'tokens/total': 564952576, 'tokens/trainable': 69095720, 'epoch': '1.305'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 428/656 [20:18:30<10:49:56, 171.04s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 429/656 [20:21:18<10:43:43, 170.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.7054', 'grad_norm': '0.7538', 'learning_rate': '0.0001', 'ppl': '2.025', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.54', 'tokens/total': 566273536, 'tokens/trainable': 69247376, 'epoch': '1.308'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 429/656 [20:21:18<10:43:43, 170.15s/it] 66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 430/656 [20:24:12<10:45:11, 171.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.7377', 'grad_norm': '0.6949', 'learning_rate': '0.0001', 'ppl': '2.091', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.66', 'tokens/total': 567594496, 'tokens/trainable': 69429624, 'epoch': '1.311'}
 66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 430/656 [20:24:12<10:45:11, 171.29s/it] 66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 431/656 [20:27:05<10:43:58, 171.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.7309', 'grad_norm': '0.7523', 'learning_rate': '0.0001', 'ppl': '2.077', 'memory/max_active (GiB)': '54.93', 'memory/max_allocated (GiB)': '54.93', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.92', 'tokens/total': 568915456, 'tokens/trainable': 69584808, 'epoch': '1.314'}
 66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                   | 431/656 [20:27:05<10:43:58, 171.73s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 432/656 [20:29:57<10:41:46, 171.90s/it]                                                                                                                                                                                                                                                  {'loss': '0.7218', 'grad_norm': '0.7082', 'learning_rate': '0.0001', 'ppl': '2.058', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.33', 'tokens/total': 570236416, 'tokens/trainable': 69764416, 'epoch': '1.317'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 432/656 [20:29:57<10:41:46, 171.90s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 433/656 [20:32:45<10:34:20, 170.67s/it]                                                                                                                                                                                                                                                  {'loss': '0.7434', 'grad_norm': '0.7725', 'learning_rate': '0.0001', 'ppl': '2.103', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.26', 'tokens/total': 571557376, 'tokens/trainable': 69910296, 'epoch': '1.32'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 433/656 [20:32:45<10:34:20, 170.67s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                  | 434/656 [20:35:36<10:31:24, 170.65s/it]                                                                                                                                                                                                                                                  {'loss': '0.7075', 'grad_norm': '0.7501', 'learning_rate': '0.0001', 'ppl': '2.029', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.34', 'tokens/total': 572878336, 'tokens/trainable': 70067440, 'epoch': '1.323'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                  | 434/656 [20:35:36<10:31:24, 170.65s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 435/656 [20:38:28<10:30:10, 171.09s/it]                                                                                                                                                                                                                                                  {'loss': '0.6935', 'grad_norm': '0.7174', 'learning_rate': '0.0001', 'ppl': '2.001', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.37', 'tokens/total': 574199296, 'tokens/trainable': 70230320, 'epoch': '1.327'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 435/656 [20:38:28<10:30:10, 171.09s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 436/656 [20:41:18<10:25:52, 170.69s/it]                                                                                                                                                                                                                                                  {'loss': '0.7215', 'grad_norm': '0.7355', 'learning_rate': '0.0001', 'ppl': '2.058', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.19', 'tokens/total': 575520256, 'tokens/trainable': 70393080, 'epoch': '1.33'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 436/656 [20:41:18<10:25:52, 170.69s/it] 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                 | 437/656 [20:44:10<10:25:12, 171.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.7642', 'grad_norm': '0.7577', 'learning_rate': '0.0001', 'ppl': '2.147', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '57.16', 'tokens/total': 576841216, 'tokens/trainable': 70544128, 'epoch': '1.333'}
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                 | 437/656 [20:44:10<10:25:12, 171.29s/it] 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 438/656 [20:46:59<10:19:47, 170.59s/it]                                                                                                                                                                                                                                                  {'loss': '0.7748', 'grad_norm': '0.7387', 'learning_rate': '0.0001', 'ppl': '2.17', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.78', 'tokens/total': 578162176, 'tokens/trainable': 70716848, 'epoch': '1.336'}
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 438/656 [20:46:59<10:19:47, 170.59s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 439/656 [20:49:49<10:16:26, 170.45s/it]                                                                                                                                                                                                                                                  {'loss': '0.7159', 'grad_norm': '0.7374', 'learning_rate': '0.0001', 'ppl': '2.046', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '25.79', 'tokens/total': 579483136, 'tokens/trainable': 70874344, 'epoch': '1.339'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 439/656 [20:49:49<10:16:26, 170.45s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 440/656 [20:52:42<10:15:22, 170.94s/it]                                                                                                                                                                                                                                                  {'loss': '0.763', 'grad_norm': '0.7338', 'learning_rate': '0.0001', 'ppl': '2.145', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '70.98', 'tokens/total': 580804096, 'tokens/trainable': 71036664, 'epoch': '1.342'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 440/656 [20:52:42<10:15:22, 170.94s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 441/656 [20:55:33<10:12:43, 170.99s/it]                                                                                                                                                                                                                                                  {'loss': '0.7192', 'grad_norm': '0.7007', 'learning_rate': '0.0001', 'ppl': '2.053', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.3', 'tokens/total': 582125056, 'tokens/trainable': 71216112, 'epoch': '1.345'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 441/656 [20:55:33<10:12:43, 170.99s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                | 442/656 [20:58:24<10:10:15, 171.10s/it]                                                                                                                                                                                                                                                  {'loss': '0.7743', 'grad_norm': '0.7779', 'learning_rate': '0.0001', 'ppl': '2.169', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '49.44', 'tokens/total': 583446016, 'tokens/trainable': 71367368, 'epoch': '1.348'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                | 442/656 [20:58:24<10:10:15, 171.10s/it] 68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                               | 443/656 [21:01:16<10:07:56, 171.25s/it]                                                                                                                                                                                                                                                  {'loss': '0.7483', 'grad_norm': '0.7489', 'learning_rate': '0.0001', 'ppl': '2.113', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '61.63', 'tokens/total': 584766976, 'tokens/trainable': 71529712, 'epoch': '1.351'}
 68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                               | 443/656 [21:01:16<10:07:56, 171.25s/it] 68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                               | 444/656 [21:04:07<10:05:38, 171.41s/it]                                                                                                                                                                                                                                                  {'loss': '0.6902', 'grad_norm': '0.7307', 'learning_rate': '0.0001', 'ppl': '1.994', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.93', 'tokens/total': 586087936, 'tokens/trainable': 71690824, 'epoch': '1.354'}
 68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                               | 444/656 [21:04:07<10:05:38, 171.41s/it] 68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 445/656 [21:06:59<10:02:44, 171.39s/it]                                                                                                                                                                                                                                                  {'loss': '0.7014', 'grad_norm': '0.7184', 'learning_rate': '0.0001', 'ppl': '2.016', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.5', 'tokens/total': 587408896, 'tokens/trainable': 71855552, 'epoch': '1.357'}
 68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 445/656 [21:06:59<10:02:44, 171.39s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 446/656 [21:09:48<9:58:04, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.7066', 'grad_norm': '0.7444', 'learning_rate': '0.0001', 'ppl': '2.027', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.92', 'tokens/total': 588729856, 'tokens/trainable': 72017576, 'epoch': '1.36'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 446/656 [21:09:48<9:58:04, 170.88s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 447/656 [21:12:40<9:55:41, 171.01s/it]                                                                                                                                                                                                                                                  {'loss': '0.7557', 'grad_norm': '0.7421', 'learning_rate': '0.0001', 'ppl': '2.129', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.17', 'tokens/total': 590050816, 'tokens/trainable': 72176640, 'epoch': '1.363'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                              | 447/656 [21:12:40<9:55:41, 171.01s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 448/656 [21:15:30<9:51:46, 170.71s/it]                                                                                                                                                                                                                                                  {'loss': '0.6874', 'grad_norm': '0.7683', 'learning_rate': '0.0001', 'ppl': '1.989', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.1', 'tokens/total': 591371776, 'tokens/trainable': 72329304, 'epoch': '1.366'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 448/656 [21:15:30<9:51:46, 170.71s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 449/656 [21:18:22<9:50:21, 171.12s/it]                                                                                                                                                                                                                                                  {'loss': '0.7074', 'grad_norm': '0.7035', 'learning_rate': '0.0001', 'ppl': '2.029', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.9', 'tokens/total': 592692736, 'tokens/trainable': 72507688, 'epoch': '1.369'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 449/656 [21:18:22<9:50:21, 171.12s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                             | 450/656 [21:21:14<9:48:31, 171.41s/it]                                                                                                                                                                                                                                                  {'loss': '0.7436', 'grad_norm': '0.742', 'learning_rate': '0.0001', 'ppl': '2.103', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.55', 'tokens/total': 594013696, 'tokens/trainable': 72660640, 'epoch': '1.372'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                             | 450/656 [21:21:14<9:48:31, 171.41s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 451/656 [21:24:04<9:44:27, 171.06s/it]                                                                                                                                                                                                                                                  {'loss': '0.7858', 'grad_norm': '0.7335', 'learning_rate': '0.0001', 'ppl': '2.194', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.35', 'tokens/total': 595334656, 'tokens/trainable': 72823920, 'epoch': '1.375'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 451/656 [21:24:04<9:44:27, 171.06s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 452/656 [21:26:55<9:41:35, 171.06s/it]                                                                                                                                                                                                                                                  {'loss': '0.7797', 'grad_norm': '0.821', 'learning_rate': '0.0001', 'ppl': '2.181', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.37', 'tokens/total': 596655616, 'tokens/trainable': 72954568, 'epoch': '1.378'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 452/656 [21:26:55<9:41:35, 171.06s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 453/656 [21:29:44<9:36:21, 170.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.7313', 'grad_norm': '0.7428', 'learning_rate': '0.0001', 'ppl': '2.078', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '25.03', 'tokens/total': 597976576, 'tokens/trainable': 73113120, 'epoch': '1.381'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 453/656 [21:29:44<9:36:21, 170.35s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 454/656 [21:32:35<9:34:09, 170.54s/it]                                                                                                                                                                                                                                                  {'loss': '0.7019', 'grad_norm': '0.6904', 'learning_rate': '0.0001', 'ppl': '2.018', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.3', 'tokens/total': 599297536, 'tokens/trainable': 73291056, 'epoch': '1.384'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                            | 454/656 [21:32:35<9:34:09, 170.54s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 455/656 [21:35:23<9:28:51, 169.81s/it]                                                                                                                                                                                                                                                  {'loss': '0.6749', 'grad_norm': '0.7863', 'learning_rate': '0.0001', 'ppl': '1.964', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '72.03', 'tokens/total': 600618496, 'tokens/trainable': 73432464, 'epoch': '1.388'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 455/656 [21:35:23<9:28:51, 169.81s/it] 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 456/656 [21:38:14<9:26:48, 170.04s/it]                                                                                                                                                                                                                                                  {'loss': '0.7165', 'grad_norm': '0.7609', 'learning_rate': '0.0001', 'ppl': '2.047', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '64.79', 'tokens/total': 601939456, 'tokens/trainable': 73592408, 'epoch': '1.391'}
 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 456/656 [21:38:14<9:26:48, 170.04s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 457/656 [21:41:04<9:23:55, 170.03s/it]                                                                                                                                                                                                                                                  {'loss': '0.7515', 'grad_norm': '0.8149', 'learning_rate': '0.0001', 'ppl': '2.12', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.29', 'tokens/total': 603260416, 'tokens/trainable': 73726832, 'epoch': '1.394'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 457/656 [21:41:04<9:23:55, 170.03s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 458/656 [21:43:56<9:23:03, 170.63s/it]                                                                                                                                                                                                                                                  {'loss': '0.6618', 'grad_norm': '0.7084', 'learning_rate': '0.0001', 'ppl': '1.938', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.18', 'tokens/total': 604581376, 'tokens/trainable': 73895144, 'epoch': '1.397'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 458/656 [21:43:56<9:23:03, 170.63s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 459/656 [21:46:47<9:20:54, 170.84s/it]                                                                                                                                                                                                                                                  {'loss': '0.751', 'grad_norm': '0.7148', 'learning_rate': '0.0001', 'ppl': '2.119', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '64.64', 'tokens/total': 605902336, 'tokens/trainable': 74068096, 'epoch': '1.4'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 459/656 [21:46:47<9:20:54, 170.84s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 460/656 [21:49:37<9:17:47, 170.75s/it]                                                                                                                                                                                                                                                  {'loss': '0.7407', 'grad_norm': '0.7934', 'learning_rate': '0.0001', 'ppl': '2.097', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '21.65', 'tokens/total': 607223296, 'tokens/trainable': 74215640, 'epoch': '1.403'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 460/656 [21:49:37<9:17:47, 170.75s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 461/656 [21:52:27<9:13:56, 170.45s/it]                                                                                                                                                                                                                                                  {'loss': '0.6814', 'grad_norm': '0.7153', 'learning_rate': '0.0001', 'ppl': '1.977', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.44', 'tokens/total': 608544256, 'tokens/trainable': 74375200, 'epoch': '1.406'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                          | 461/656 [21:52:27<9:13:56, 170.45s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 462/656 [21:55:18<9:11:04, 170.43s/it]                                                                                                                                                                                                                                                  {'loss': '0.8437', 'grad_norm': '0.7578', 'learning_rate': '0.0001', 'ppl': '2.325', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.85', 'tokens/total': 609865216, 'tokens/trainable': 74535336, 'epoch': '1.409'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 462/656 [21:55:18<9:11:04, 170.43s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                          | 463/656 [21:58:07<9:07:33, 170.22s/it]                                                                                                                                                                                                                                                  {'loss': '0.7191', 'grad_norm': '0.7214', 'learning_rate': '0.0001', 'ppl': '2.053', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.97', 'tokens/total': 611186176, 'tokens/trainable': 74700896, 'epoch': '1.412'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                          | 463/656 [21:58:07<9:07:33, 170.22s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 464/656 [22:00:58<9:04:39, 170.21s/it]                                                                                                                                                                                                                                                  {'loss': '0.758', 'grad_norm': '0.792', 'learning_rate': '0.0001', 'ppl': '2.134', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '26.53', 'tokens/total': 612507136, 'tokens/trainable': 74845656, 'epoch': '1.415'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 464/656 [22:00:58<9:04:39, 170.21s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 465/656 [22:03:49<9:02:43, 170.49s/it]                                                                                                                                                                                                                                                  {'loss': '0.7617', 'grad_norm': '0.7931', 'learning_rate': '0.0001', 'ppl': '2.142', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.86', 'tokens/total': 613828096, 'tokens/trainable': 74995976, 'epoch': '1.418'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 465/656 [22:03:49<9:02:43, 170.49s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 466/656 [22:06:42<9:02:26, 171.30s/it]                                                                                                                                                                                                                                                  {'loss': '0.6982', 'grad_norm': '0.6992', 'learning_rate': '0.0001', 'ppl': '2.01', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '75.15', 'tokens/total': 615149056, 'tokens/trainable': 75174816, 'epoch': '1.421'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 466/656 [22:06:42<9:02:26, 171.30s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 467/656 [22:09:32<8:58:31, 170.96s/it]                                                                                                                                                                                                                                                  {'loss': '0.7622', 'grad_norm': '0.7598', 'learning_rate': '0.0001', 'ppl': '2.143', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.77', 'tokens/total': 616470016, 'tokens/trainable': 75331096, 'epoch': '1.424'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 467/656 [22:09:32<8:58:31, 170.96s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                        | 468/656 [22:12:23<8:55:18, 170.84s/it]                                                                                                                                                                                                                                                  {'loss': '0.75', 'grad_norm': '0.7728', 'learning_rate': '0.0001', 'ppl': '2.117', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.19', 'tokens/total': 617790976, 'tokens/trainable': 75475344, 'epoch': '1.427'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                        | 468/656 [22:12:23<8:55:18, 170.84s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 469/656 [22:15:15<8:53:53, 171.30s/it]                                                                                                                                                                                                                                                  {'loss': '0.7233', 'grad_norm': '0.7639', 'learning_rate': '0.0001', 'ppl': '2.061', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.52', 'tokens/total': 619111936, 'tokens/trainable': 75633200, 'epoch': '1.43'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 469/656 [22:15:15<8:53:53, 171.30s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 470/656 [22:18:04<8:48:55, 170.62s/it]                                                                                                                                                                                                                                                  {'loss': '0.7207', 'grad_norm': '0.7879', 'learning_rate': '0.0001', 'ppl': '2.056', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.4', 'tokens/total': 620432896, 'tokens/trainable': 75773856, 'epoch': '1.433'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 470/656 [22:18:04<8:48:55, 170.62s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 471/656 [22:20:55<8:46:39, 170.81s/it]                                                                                                                                                                                                                                                  {'loss': '0.7382', 'grad_norm': '0.7199', 'learning_rate': '0.0001', 'ppl': '2.092', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '61.89', 'tokens/total': 621753856, 'tokens/trainable': 75940312, 'epoch': '1.436'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 471/656 [22:20:55<8:46:39, 170.81s/it] 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 472/656 [22:23:43<8:41:16, 169.98s/it]                                                                                                                                                                                                                                                  {'loss': '0.7171', 'grad_norm': '0.7632', 'learning_rate': '0.0001', 'ppl': '2.048', 'memory/max_active (GiB)': '54.73', 'memory/max_allocated (GiB)': '54.73', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.22', 'tokens/total': 623074816, 'tokens/trainable': 76087704, 'epoch': '1.439'}
 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 472/656 [22:23:43<8:41:16, 169.98s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 473/656 [22:26:33<8:38:07, 169.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.7234', 'grad_norm': '0.7412', 'learning_rate': '0.0001', 'ppl': '2.061', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.41', 'tokens/total': 624395776, 'tokens/trainable': 76251072, 'epoch': '1.442'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 473/656 [22:26:33<8:38:07, 169.88s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 474/656 [22:29:24<8:36:37, 170.31s/it]                                                                                                                                                                                                                                                  {'loss': '0.707', 'grad_norm': '0.7275', 'learning_rate': '0.0001', 'ppl': '2.028', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.09', 'tokens/total': 625716736, 'tokens/trainable': 76420304, 'epoch': '1.446'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 474/656 [22:29:24<8:36:37, 170.31s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 475/656 [22:32:15<8:33:52, 170.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.7722', 'grad_norm': '0.732', 'learning_rate': '0.0001', 'ppl': '2.165', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '23.91', 'tokens/total': 627037696, 'tokens/trainable': 76586792, 'epoch': '1.449'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 475/656 [22:32:15<8:33:52, 170.35s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 476/656 [22:35:05<8:31:07, 170.37s/it]                                                                                                                                                                                                                                                  {'loss': '0.7059', 'grad_norm': '0.7406', 'learning_rate': '0.0001', 'ppl': '2.026', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '27.11', 'tokens/total': 628358656, 'tokens/trainable': 76742896, 'epoch': '1.452'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 476/656 [22:35:05<8:31:07, 170.37s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 477/656 [22:37:56<8:28:31, 170.45s/it]                                                                                                                                                                                                                                                  {'loss': '0.7059', 'grad_norm': '0.7294', 'learning_rate': '0.0001', 'ppl': '2.026', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.07', 'tokens/total': 629679616, 'tokens/trainable': 76901008, 'epoch': '1.455'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 477/656 [22:37:56<8:28:31, 170.45s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 478/656 [22:40:47<8:26:29, 170.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.7697', 'grad_norm': '0.74', 'learning_rate': '0.0001', 'ppl': '2.159', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.51', 'tokens/total': 631000576, 'tokens/trainable': 77065536, 'epoch': '1.458'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 478/656 [22:40:47<8:26:29, 170.73s/it] 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 479/656 [22:43:38<8:23:42, 170.75s/it]                                                                                                                                                                                                                                                  {'loss': '0.7479', 'grad_norm': '0.7165', 'learning_rate': '0.0001', 'ppl': '2.113', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.5', 'tokens/total': 632321536, 'tokens/trainable': 77247472, 'epoch': '1.461'}
 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 479/656 [22:43:38<8:23:42, 170.75s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 480/656 [22:46:30<8:21:51, 171.09s/it]                                                                                                                                                                                                                                                  {'loss': '0.7254', 'grad_norm': '0.7255', 'learning_rate': '0.0001', 'ppl': '2.066', 'memory/max_active (GiB)': '54.95', 'memory/max_allocated (GiB)': '54.95', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.88', 'tokens/total': 633642496, 'tokens/trainable': 77414248, 'epoch': '1.464'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 480/656 [22:46:30<8:21:51, 171.09s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 481/656 [22:49:21<8:18:47, 171.02s/it]                                                                                                                                                                                                                                                  {'loss': '0.6937', 'grad_norm': '0.7122', 'learning_rate': '0.0001', 'ppl': '2.001', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.1', 'tokens/total': 634963456, 'tokens/trainable': 77577016, 'epoch': '1.467'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 481/656 [22:49:21<8:18:47, 171.02s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 482/656 [22:52:13<8:17:08, 171.43s/it]                                                                                                                                                                                                                                                  {'loss': '0.7272', 'grad_norm': '0.7036', 'learning_rate': '0.0001', 'ppl': '2.069', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '72.66', 'tokens/total': 636284416, 'tokens/trainable': 77754848, 'epoch': '1.47'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 482/656 [22:52:13<8:17:08, 171.43s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 483/656 [22:55:04<8:14:05, 171.36s/it]                                                                                                                                                                                                                                                  {'loss': '0.7295', 'grad_norm': '0.7214', 'learning_rate': '0.0001', 'ppl': '2.074', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.88', 'tokens/total': 637605376, 'tokens/trainable': 77931544, 'epoch': '1.473'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 483/656 [22:55:04<8:14:05, 171.36s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 484/656 [22:57:56<8:11:54, 171.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.7362', 'grad_norm': '0.732', 'learning_rate': '0.0001', 'ppl': '2.088', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46', 'tokens/total': 638926336, 'tokens/trainable': 78094760, 'epoch': '1.476'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 484/656 [22:57:56<8:11:54, 171.60s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 485/656 [23:00:47<8:08:01, 171.24s/it]                                                                                                                                                                                                                                                  {'loss': '0.7435', 'grad_norm': '0.7895', 'learning_rate': '0.0001', 'ppl': '2.103', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '26.93', 'tokens/total': 640247296, 'tokens/trainable': 78234608, 'epoch': '1.479'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 485/656 [23:00:47<8:08:01, 171.24s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 486/656 [23:03:36<8:03:35, 170.68s/it]                                                                                                                                                                                                                                                  {'loss': '0.7037', 'grad_norm': '0.7511', 'learning_rate': '0.0001', 'ppl': '2.021', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '49.05', 'tokens/total': 641568256, 'tokens/trainable': 78377928, 'epoch': '1.482'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 486/656 [23:03:36<8:03:35, 170.68s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 487/656 [23:06:27<8:00:46, 170.69s/it]                                                                                                                                                                                                                                                  {'loss': '0.6738', 'grad_norm': '0.6728', 'learning_rate': '0.0001', 'ppl': '1.962', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.21', 'tokens/total': 642889216, 'tokens/trainable': 78566808, 'epoch': '1.485'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 487/656 [23:06:27<8:00:46, 170.69s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                  | 488/656 [23:09:18<7:58:34, 170.92s/it]                                                                                                                                                                                                                                                  {'loss': '0.7354', 'grad_norm': '0.7613', 'learning_rate': '0.0001', 'ppl': '2.086', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.28', 'tokens/total': 644210176, 'tokens/trainable': 78712888, 'epoch': '1.488'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                  | 488/656 [23:09:18<7:58:34, 170.92s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 489/656 [23:12:09<7:55:10, 170.72s/it]                                                                                                                                                                                                                                                  {'loss': '0.7175', 'grad_norm': '0.7243', 'learning_rate': '0.0001', 'ppl': '2.049', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '49.23', 'tokens/total': 645531136, 'tokens/trainable': 78877440, 'epoch': '1.491'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 489/656 [23:12:09<7:55:10, 170.72s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 490/656 [23:14:58<7:51:12, 170.32s/it]                                                                                                                                                                                                                                                  {'loss': '0.7727', 'grad_norm': '1.226', 'learning_rate': '0.0001', 'ppl': '2.166', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.14', 'tokens/total': 646852096, 'tokens/trainable': 79014888, 'epoch': '1.494'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 490/656 [23:14:58<7:51:12, 170.32s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 491/656 [23:17:47<7:47:43, 170.08s/it]                                                                                                                                                                                                                                                  {'loss': '0.7595', 'grad_norm': '0.7855', 'learning_rate': '0.0001', 'ppl': '2.137', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.46', 'tokens/total': 648173056, 'tokens/trainable': 79163704, 'epoch': '1.497'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 491/656 [23:17:48<7:47:43, 170.08s/it] 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 492/656 [23:20:39<7:45:52, 170.44s/it]                                                                                                                                                                                                                                                  {'loss': '0.7425', 'grad_norm': '0.7931', 'learning_rate': '0.0001', 'ppl': '2.101', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.89', 'tokens/total': 649494016, 'tokens/trainable': 79305128, 'epoch': '1.5'}
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 492/656 [23:20:39<7:45:52, 170.44s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 493/656 [23:23:30<7:43:28, 170.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.7328', 'grad_norm': '0.7546', 'learning_rate': '0.0001', 'ppl': '2.081', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.55', 'tokens/total': 650814976, 'tokens/trainable': 79455552, 'epoch': '1.504'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 493/656 [23:23:30<7:43:28, 170.60s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 494/656 [23:26:19<7:39:44, 170.28s/it]                                                                                                                                                                                                                                                  {'loss': '0.6952', 'grad_norm': '0.7185', 'learning_rate': '0.0001', 'ppl': '2.004', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.96', 'tokens/total': 652135936, 'tokens/trainable': 79621864, 'epoch': '1.507'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 494/656 [23:26:19<7:39:44, 170.28s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 495/656 [23:29:11<7:37:46, 170.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.7118', 'grad_norm': '0.7015', 'learning_rate': '0.0001', 'ppl': '2.038', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.17', 'tokens/total': 653456896, 'tokens/trainable': 79797392, 'epoch': '1.51'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 495/656 [23:29:11<7:37:46, 170.60s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 496/656 [23:32:01<7:34:28, 170.43s/it]                                                                                                                                                                                                                                                  {'loss': '0.7696', 'grad_norm': '0.7659', 'learning_rate': '0.0001', 'ppl': '2.159', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.35', 'tokens/total': 654777856, 'tokens/trainable': 79951184, 'epoch': '1.513'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 496/656 [23:32:01<7:34:28, 170.43s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 497/656 [23:34:53<7:33:22, 171.08s/it]                                                                                                                                                                                                                                                  {'loss': '0.7437', 'grad_norm': '0.7093', 'learning_rate': '0.0001', 'ppl': '2.104', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.46', 'tokens/total': 656098816, 'tokens/trainable': 80123952, 'epoch': '1.516'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 497/656 [23:34:53<7:33:22, 171.08s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 498/656 [23:37:45<7:31:01, 171.27s/it]                                                                                                                                                                                                                                                  {'loss': '0.7662', 'grad_norm': '0.7682', 'learning_rate': '0.0001', 'ppl': '2.152', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '23.82', 'tokens/total': 657419776, 'tokens/trainable': 80265984, 'epoch': '1.519'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 498/656 [23:37:45<7:31:01, 171.27s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 499/656 [23:40:37<7:29:02, 171.61s/it]                                                                                                                                                                                                                                                  {'loss': '0.7303', 'grad_norm': '0.708', 'learning_rate': '0.0001', 'ppl': '2.076', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.18', 'tokens/total': 658740736, 'tokens/trainable': 80439984, 'epoch': '1.522'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 499/656 [23:40:37<7:29:02, 171.61s/it] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 500/656 [23:43:28<7:25:35, 171.38s/it]                                                                                                                                                                                                                                                  {'loss': '0.736', 'grad_norm': '0.7677', 'learning_rate': '0.0001', 'ppl': '2.088', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.87', 'tokens/total': 660061696, 'tokens/trainable': 80588392, 'epoch': '1.525'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 500/656 [23:43:28<7:25:35, 171.38s/it] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 501/656 [23:46:20<7:22:58, 171.47s/it]                                                                                                                                                                                                                                                  {'loss': '0.7323', 'grad_norm': '0.7141', 'learning_rate': '0.0001', 'ppl': '2.08', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.29', 'tokens/total': 661382656, 'tokens/trainable': 80762632, 'epoch': '1.528'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 501/656 [23:46:20<7:22:58, 171.47s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 502/656 [23:49:10<7:19:05, 171.07s/it]                                                                                                                                                                                                                                                  {'loss': '0.7884', 'grad_norm': '0.7801', 'learning_rate': '0.0001', 'ppl': '2.2', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.63', 'tokens/total': 662703616, 'tokens/trainable': 80911056, 'epoch': '1.531'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 502/656 [23:49:10<7:19:05, 171.07s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 503/656 [23:52:02<7:16:55, 171.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.7183', 'grad_norm': '0.7306', 'learning_rate': '0.0001', 'ppl': '2.051', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.92', 'tokens/total': 664024576, 'tokens/trainable': 81079560, 'epoch': '1.534'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 503/656 [23:52:02<7:16:55, 171.35s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 504/656 [23:54:54<7:14:25, 171.49s/it]                                                                                                                                                                                                                                                  {'loss': '0.7394', 'grad_norm': '0.6865', 'learning_rate': '0.0001', 'ppl': '2.095', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.03', 'tokens/total': 665345536, 'tokens/trainable': 81261928, 'epoch': '1.537'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 504/656 [23:54:54<7:14:25, 171.49s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 505/656 [23:57:45<7:11:35, 171.50s/it]                                                                                                                                                                                                                                                  {'loss': '0.7165', 'grad_norm': '0.7361', 'learning_rate': '0.0001', 'ppl': '2.047', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.62', 'tokens/total': 666666496, 'tokens/trainable': 81424800, 'epoch': '1.54'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 505/656 [23:57:45<7:11:35, 171.50s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 506/656 [24:00:34<7:06:56, 170.78s/it]                                                                                                                                                                                                                                                  {'loss': '0.7006', 'grad_norm': '0.704', 'learning_rate': '0.0001', 'ppl': '2.015', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.77', 'tokens/total': 667987456, 'tokens/trainable': 81600760, 'epoch': '1.543'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 506/656 [24:00:34<7:06:56, 170.78s/it] 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 507/656 [24:03:26<7:04:22, 170.89s/it]                                                                                                                                                                                                                                                  {'loss': '0.6948', 'grad_norm': '0.7221', 'learning_rate': '0.0001', 'ppl': '2.003', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.88', 'tokens/total': 669308416, 'tokens/trainable': 81762792, 'epoch': '1.546'}
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 507/656 [24:03:26<7:04:22, 170.89s/it] 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 508/656 [24:06:17<7:01:39, 170.95s/it]                                                                                                                                                                                                                                                  {'loss': '0.7402', 'grad_norm': '0.7484', 'learning_rate': '0.0001', 'ppl': '2.096', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.11', 'tokens/total': 670629376, 'tokens/trainable': 81924528, 'epoch': '1.549'}
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 508/656 [24:06:17<7:01:39, 170.95s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 509/656 [24:09:09<6:59:54, 171.39s/it]                                                                                                                                                                                                                                                  {'loss': '0.7162', 'grad_norm': '0.6911', 'learning_rate': '0.0001', 'ppl': '2.047', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '37.57', 'tokens/total': 671950336, 'tokens/trainable': 82105152, 'epoch': '1.552'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 509/656 [24:09:09<6:59:54, 171.39s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 510/656 [24:12:00<6:56:19, 171.10s/it]                                                                                                                                                                                                                                                  {'loss': '0.748', 'grad_norm': '0.7541', 'learning_rate': '0.0001', 'ppl': '2.113', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.56', 'tokens/total': 673271296, 'tokens/trainable': 82251728, 'epoch': '1.555'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 510/656 [24:12:00<6:56:19, 171.10s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 511/656 [24:14:50<6:53:22, 171.05s/it]                                                                                                                                                                                                                                                  {'loss': '0.6735', 'grad_norm': '0.684', 'learning_rate': '0.0001', 'ppl': '1.961', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.14', 'tokens/total': 674592256, 'tokens/trainable': 82424128, 'epoch': '1.558'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 511/656 [24:14:50<6:53:22, 171.05s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 512/656 [24:17:41<6:50:16, 170.95s/it]                                                                                                                                                                                                                                                  {'loss': '0.7509', 'grad_norm': '0.713', 'learning_rate': '0.0001', 'ppl': '2.119', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.39', 'tokens/total': 675913216, 'tokens/trainable': 82590864, 'epoch': '1.561'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 512/656 [24:17:41<6:50:16, 170.95s/it] 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 513/656 [24:20:33<6:47:47, 171.10s/it]                                                                                                                                                                                                                                                  {'loss': '0.7433', 'grad_norm': '0.721', 'learning_rate': '0.0001', 'ppl': '2.103', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.45', 'tokens/total': 677234176, 'tokens/trainable': 82757456, 'epoch': '1.565'}
 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 513/656 [24:20:33<6:47:47, 171.10s/it] 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 514/656 [24:23:25<6:45:44, 171.44s/it]                                                                                                                                                                                                                                                  {'loss': '0.7379', 'grad_norm': '0.6992', 'learning_rate': '0.0001', 'ppl': '2.092', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '73.6', 'tokens/total': 678555136, 'tokens/trainable': 82931000, 'epoch': '1.568'}
 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 514/656 [24:23:25<6:45:44, 171.44s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 515/656 [24:26:17<6:43:07, 171.54s/it]                                                                                                                                                                                                                                                  {'loss': '0.7721', 'grad_norm': '0.7217', 'learning_rate': '0.0001', 'ppl': '2.164', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.94', 'tokens/total': 679876096, 'tokens/trainable': 83096072, 'epoch': '1.571'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 515/656 [24:26:17<6:43:07, 171.54s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 516/656 [24:29:07<6:39:22, 171.16s/it]                                                                                                                                                                                                                                                  {'loss': '0.6958', 'grad_norm': '0.7184', 'learning_rate': '0.0001', 'ppl': '2.005', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.8', 'tokens/total': 681197056, 'tokens/trainable': 83255424, 'epoch': '1.574'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 516/656 [24:29:07<6:39:22, 171.16s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 517/656 [24:32:00<6:37:55, 171.76s/it]                                                                                                                                                                                                                                                  {'loss': '0.7908', 'grad_norm': '0.7069', 'learning_rate': '0.0001', 'ppl': '2.205', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '47.57', 'tokens/total': 682518016, 'tokens/trainable': 83429848, 'epoch': '1.577'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 517/656 [24:32:00<6:37:55, 171.76s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 518/656 [24:34:50<6:33:58, 171.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.7356', 'grad_norm': '0.7406', 'learning_rate': '0.0001', 'ppl': '2.087', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '69.95', 'tokens/total': 683838976, 'tokens/trainable': 83586104, 'epoch': '1.58'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 518/656 [24:34:50<6:33:58, 171.29s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 519/656 [24:37:42<6:31:17, 171.37s/it]                                                                                                                                                                                                                                                  {'loss': '0.6937', 'grad_norm': '0.7382', 'learning_rate': '0.0001', 'ppl': '2.001', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.52', 'tokens/total': 685159936, 'tokens/trainable': 83743584, 'epoch': '1.583'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 519/656 [24:37:42<6:31:17, 171.37s/it] 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 520/656 [24:40:31<6:26:59, 170.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.7147', 'grad_norm': '0.7401', 'learning_rate': '0.0001', 'ppl': '2.043', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.87', 'tokens/total': 686480896, 'tokens/trainable': 83902360, 'epoch': '1.586'}
 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 520/656 [24:40:31<6:26:59, 170.73s/it] 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 521/656 [24:43:21<6:23:50, 170.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.7551', 'grad_norm': '0.7636', 'learning_rate': '0.0001', 'ppl': '2.128', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '49.21', 'tokens/total': 687801856, 'tokens/trainable': 84056504, 'epoch': '1.589'}
 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 521/656 [24:43:21<6:23:50, 170.60s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                        | 522/656 [24:46:11<6:20:28, 170.36s/it]                                                                                                                                                                                                                                                  {'loss': '0.6993', 'grad_norm': '0.7143', 'learning_rate': '0.0001', 'ppl': '2.012', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.64', 'tokens/total': 689122816, 'tokens/trainable': 84225696, 'epoch': '1.592'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                        | 522/656 [24:46:11<6:20:28, 170.36s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 523/656 [24:49:01<6:17:09, 170.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.7281', 'grad_norm': '0.7542', 'learning_rate': '0.0001', 'ppl': '2.071', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.06', 'tokens/total': 690443776, 'tokens/trainable': 84377568, 'epoch': '1.595'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 523/656 [24:49:01<6:17:09, 170.15s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 524/656 [24:51:51<6:14:19, 170.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.7388', 'grad_norm': '0.7625', 'learning_rate': '0.0001', 'ppl': '2.093', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '35.88', 'tokens/total': 691764736, 'tokens/trainable': 84525504, 'epoch': '1.598'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 524/656 [24:51:51<6:14:19, 170.15s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 525/656 [24:54:41<6:11:10, 170.00s/it]                                                                                                                                                                                                                                                  {'loss': '0.6984', 'grad_norm': '0.7196', 'learning_rate': '0.0001', 'ppl': '2.01', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '67.57', 'tokens/total': 693085696, 'tokens/trainable': 84691504, 'epoch': '1.601'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 525/656 [24:54:41<6:11:10, 170.00s/it] 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 526/656 [24:57:34<6:10:44, 171.11s/it]                                                                                                                                                                                                                                                  {'loss': '0.7366', 'grad_norm': '0.7275', 'learning_rate': '0.0001', 'ppl': '2.089', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.93', 'tokens/total': 694406656, 'tokens/trainable': 84856528, 'epoch': '1.604'}
 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 526/656 [24:57:34<6:10:44, 171.11s/it] 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 527/656 [25:00:25<6:07:54, 171.12s/it]                                                                                                                                                                                                                                                  {'loss': '0.6996', 'grad_norm': '0.7063', 'learning_rate': '0.0001', 'ppl': '2.013', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.65', 'tokens/total': 695727616, 'tokens/trainable': 85021424, 'epoch': '1.607'}
 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 527/656 [25:00:25<6:07:54, 171.12s/it] 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 528/656 [25:03:17<6:05:22, 171.27s/it]                                                                                                                                                                                                                                                  {'loss': '0.6986', 'grad_norm': '0.7305', 'learning_rate': '0.0001', 'ppl': '2.011', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.37', 'tokens/total': 697048576, 'tokens/trainable': 85185272, 'epoch': '1.61'}
 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 528/656 [25:03:17<6:05:22, 171.27s/it] 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 529/656 [25:06:08<6:02:07, 171.08s/it]                                                                                                                                                                                                                                                  {'loss': '0.7113', 'grad_norm': '0.696', 'learning_rate': '0.0001', 'ppl': '2.037', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28', 'tokens/total': 698369536, 'tokens/trainable': 85361656, 'epoch': '1.613'}
 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 529/656 [25:06:08<6:02:07, 171.08s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 530/656 [25:09:00<5:59:49, 171.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.764', 'grad_norm': '0.7416', 'learning_rate': '0.0001', 'ppl': '2.147', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.29', 'tokens/total': 699690496, 'tokens/trainable': 85515568, 'epoch': '1.616'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 530/656 [25:09:00<5:59:49, 171.35s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 531/656 [25:11:49<5:55:54, 170.83s/it]                                                                                                                                                                                                                                                  {'loss': '0.7698', 'grad_norm': '0.7974', 'learning_rate': '0.0001', 'ppl': '2.159', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '30.7', 'tokens/total': 701011456, 'tokens/trainable': 85649088, 'epoch': '1.619'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 531/656 [25:11:49<5:55:54, 170.83s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 532/656 [25:14:39<5:52:16, 170.45s/it]                                                                                                                                                                                                                                                  {'loss': '0.76', 'grad_norm': '0.7562', 'learning_rate': '0.0001', 'ppl': '2.138', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.47', 'tokens/total': 702332416, 'tokens/trainable': 85799192, 'epoch': '1.623'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 532/656 [25:14:39<5:52:16, 170.45s/it] 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 533/656 [25:17:28<5:48:31, 170.01s/it]                                                                                                                                                                                                                                                  {'loss': '0.8209', 'grad_norm': '0.763', 'learning_rate': '0.0001', 'ppl': '2.272', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.98', 'tokens/total': 703653376, 'tokens/trainable': 85956800, 'epoch': '1.626'}
 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 533/656 [25:17:28<5:48:31, 170.01s/it] 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 534/656 [25:20:19<5:46:31, 170.42s/it]                                                                                                                                                                                                                                                  {'loss': '0.7634', 'grad_norm': '0.7802', 'learning_rate': '0.0001', 'ppl': '2.146', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.54', 'tokens/total': 704974336, 'tokens/trainable': 86115888, 'epoch': '1.629'}
 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 534/656 [25:20:19<5:46:31, 170.42s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 535/656 [25:23:11<5:44:24, 170.78s/it]                                                                                                                                                                                                                                                  {'loss': '0.7139', 'grad_norm': '0.7095', 'learning_rate': '0.0001', 'ppl': '2.042', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.97', 'tokens/total': 706295296, 'tokens/trainable': 86287016, 'epoch': '1.632'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 535/656 [25:23:11<5:44:24, 170.78s/it] 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 536/656 [25:26:00<5:40:49, 170.41s/it]                                                                                                                                                                                                                                                  {'loss': '0.7117', 'grad_norm': '0.7152', 'learning_rate': '0.0001', 'ppl': '2.037', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '49.83', 'tokens/total': 707616256, 'tokens/trainable': 86451112, 'epoch': '1.635'}
 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 536/656 [25:26:00<5:40:49, 170.41s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 537/656 [25:28:52<5:38:44, 170.80s/it]                                                                                                                                                                                                                                                  {'loss': '0.7321', 'grad_norm': '0.7451', 'learning_rate': '0.0001', 'ppl': '2.079', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '23.77', 'tokens/total': 708937216, 'tokens/trainable': 86593904, 'epoch': '1.638'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 537/656 [25:28:52<5:38:44, 170.80s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 538/656 [25:31:43<5:35:56, 170.81s/it]                                                                                                                                                                                                                                                  {'loss': '0.6813', 'grad_norm': '0.6915', 'learning_rate': '0.0001', 'ppl': '1.976', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.03', 'tokens/total': 710258176, 'tokens/trainable': 86763504, 'epoch': '1.641'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 538/656 [25:31:43<5:35:56, 170.81s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 539/656 [25:34:36<5:34:08, 171.35s/it]                                                                                                                                                                                                                                                  {'loss': '0.8111', 'grad_norm': '0.7441', 'learning_rate': '0.0001', 'ppl': '2.25', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.11', 'tokens/total': 711579136, 'tokens/trainable': 86927680, 'epoch': '1.644'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 539/656 [25:34:36<5:34:08, 171.35s/it] 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 540/656 [25:37:25<5:30:08, 170.76s/it]                                                                                                                                                                                                                                                  {'loss': '0.7121', 'grad_norm': '0.7563', 'learning_rate': '0.0001', 'ppl': '2.038', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.03', 'tokens/total': 712900096, 'tokens/trainable': 87075352, 'epoch': '1.647'}
 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 540/656 [25:37:25<5:30:08, 170.76s/it] 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 541/656 [25:40:15<5:26:56, 170.58s/it]                                                                                                                                                                                                                                                  {'loss': '0.7362', 'grad_norm': '0.7418', 'learning_rate': '0.0001', 'ppl': '2.088', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.9', 'tokens/total': 714221056, 'tokens/trainable': 87235272, 'epoch': '1.65'}
 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 541/656 [25:40:15<5:26:56, 170.58s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 542/656 [25:43:05<5:23:49, 170.43s/it]                                                                                                                                                                                                                                                  {'loss': '0.7362', 'grad_norm': '0.7297', 'learning_rate': '0.0001', 'ppl': '2.088', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.93', 'tokens/total': 715542016, 'tokens/trainable': 87389936, 'epoch': '1.653'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 542/656 [25:43:05<5:23:49, 170.43s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 543/656 [25:45:56<5:21:10, 170.53s/it]                                                                                                                                                                                                                                                  {'loss': '0.7401', 'grad_norm': '0.7651', 'learning_rate': '0.0001', 'ppl': '2.096', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.53', 'tokens/total': 716862976, 'tokens/trainable': 87530416, 'epoch': '1.656'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 543/656 [25:45:56<5:21:10, 170.53s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 544/656 [25:48:48<5:18:58, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.7366', 'grad_norm': '0.7449', 'learning_rate': '0.0001', 'ppl': '2.089', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.83', 'tokens/total': 718183936, 'tokens/trainable': 87683488, 'epoch': '1.659'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 544/656 [25:48:48<5:18:58, 170.88s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 545/656 [25:51:39<5:16:09, 170.90s/it]                                                                                                                                                                                                                                                  {'loss': '0.7509', 'grad_norm': '0.7464', 'learning_rate': '0.0001', 'ppl': '2.119', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.16', 'tokens/total': 719504896, 'tokens/trainable': 87839576, 'epoch': '1.662'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 545/656 [25:51:39<5:16:09, 170.90s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 546/656 [25:54:31<5:14:21, 171.47s/it]                                                                                                                                                                                                                                                  {'loss': '0.7447', 'grad_norm': '0.7033', 'learning_rate': '0.0001', 'ppl': '2.106', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '66.65', 'tokens/total': 720825856, 'tokens/trainable': 88015528, 'epoch': '1.665'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 546/656 [25:54:31<5:14:21, 171.47s/it] 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 547/656 [25:57:22<5:11:10, 171.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.7522', 'grad_norm': '0.7079', 'learning_rate': '0.0001', 'ppl': '2.122', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.94', 'tokens/total': 722146816, 'tokens/trainable': 88187488, 'epoch': '1.668'}
 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 547/656 [25:57:22<5:11:10, 171.29s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 548/656 [26:00:13<5:07:55, 171.07s/it]                                                                                                                                                                                                                                                  {'loss': '0.7355', 'grad_norm': '0.7375', 'learning_rate': '0.0001', 'ppl': '2.087', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '66.23', 'tokens/total': 723467776, 'tokens/trainable': 88349352, 'epoch': '1.671'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 548/656 [26:00:13<5:07:55, 171.07s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 549/656 [26:03:03<5:04:39, 170.84s/it]                                                                                                                                                                                                                                                  {'loss': '0.7102', 'grad_norm': '0.7022', 'learning_rate': '0.0001', 'ppl': '2.034', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.32', 'tokens/total': 724788736, 'tokens/trainable': 88524240, 'epoch': '1.674'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 549/656 [26:03:03<5:04:39, 170.84s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 550/656 [26:05:52<5:00:50, 170.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.7738', 'grad_norm': '0.7637', 'learning_rate': '0.0001', 'ppl': '2.168', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '68.16', 'tokens/total': 726109696, 'tokens/trainable': 88670976, 'epoch': '1.677'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 550/656 [26:05:52<5:00:50, 170.29s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 551/656 [26:08:42<4:57:34, 170.05s/it]                                                                                                                                                                                                                                                  {'loss': '0.6961', 'grad_norm': '0.7052', 'learning_rate': '0.0001', 'ppl': '2.006', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '72.81', 'tokens/total': 727430656, 'tokens/trainable': 88833352, 'epoch': '1.681'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 551/656 [26:08:42<4:57:34, 170.05s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 552/656 [26:11:32<4:54:50, 170.10s/it]                                                                                                                                                                                                                                                  {'loss': '0.7178', 'grad_norm': '0.6864', 'learning_rate': '0.0001', 'ppl': '2.05', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.47', 'tokens/total': 728751616, 'tokens/trainable': 89004000, 'epoch': '1.684'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 552/656 [26:11:32<4:54:50, 170.10s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 553/656 [26:14:22<4:52:16, 170.26s/it]                                                                                                                                                                                                                                                  {'loss': '0.7744', 'grad_norm': '0.7157', 'learning_rate': '0.0001', 'ppl': '2.169', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '42.74', 'tokens/total': 730072576, 'tokens/trainable': 89167488, 'epoch': '1.687'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 553/656 [26:14:22<4:52:16, 170.26s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 554/656 [26:17:13<4:49:27, 170.27s/it]                                                                                                                                                                                                                                                  {'loss': '0.7083', 'grad_norm': '0.7148', 'learning_rate': '0.0001', 'ppl': '2.03', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '23.72', 'tokens/total': 731393536, 'tokens/trainable': 89330288, 'epoch': '1.69'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 554/656 [26:17:13<4:49:27, 170.27s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 555/656 [26:20:04<4:47:13, 170.63s/it]                                                                                                                                                                                                                                                  {'loss': '0.7186', 'grad_norm': '0.7188', 'learning_rate': '0.0001', 'ppl': '2.052', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.59', 'tokens/total': 732714496, 'tokens/trainable': 89493680, 'epoch': '1.693'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 555/656 [26:20:04<4:47:13, 170.63s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                              | 556/656 [26:22:57<4:45:14, 171.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.7724', 'grad_norm': '0.7419', 'learning_rate': '0.0001', 'ppl': '2.165', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '55.55', 'tokens/total': 734035456, 'tokens/trainable': 89660728, 'epoch': '1.696'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                              | 556/656 [26:22:57<4:45:14, 171.15s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 557/656 [26:25:47<4:42:10, 171.01s/it]                                                                                                                                                                                                                                                  {'loss': '0.7506', 'grad_norm': '0.7213', 'learning_rate': '0.0001', 'ppl': '2.118', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.4', 'tokens/total': 735356416, 'tokens/trainable': 89822488, 'epoch': '1.699'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 557/656 [26:25:47<4:42:10, 171.01s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 558/656 [26:28:41<4:40:36, 171.80s/it]                                                                                                                                                                                                                                                  {'loss': '0.7156', 'grad_norm': '0.7143', 'learning_rate': '0.0001', 'ppl': '2.045', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '72.35', 'tokens/total': 736677376, 'tokens/trainable': 89984400, 'epoch': '1.702'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 558/656 [26:28:41<4:40:36, 171.80s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 559/656 [26:31:31<4:36:53, 171.27s/it]                                                                                                                                                                                                                                                  {'loss': '0.7103', 'grad_norm': '0.7233', 'learning_rate': '0.0001', 'ppl': '2.035', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.18', 'tokens/total': 737998336, 'tokens/trainable': 90140952, 'epoch': '1.705'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 559/656 [26:31:31<4:36:53, 171.27s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 560/656 [26:34:19<4:32:18, 170.20s/it]                                                                                                                                                                                                                                                  {'loss': '0.7621', 'grad_norm': '0.7702', 'learning_rate': '0.0001', 'ppl': '2.143', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '32.25', 'tokens/total': 739319296, 'tokens/trainable': 90284032, 'epoch': '1.708'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 560/656 [26:34:19<4:32:18, 170.20s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 561/656 [26:37:10<4:30:01, 170.54s/it]                                                                                                                                                                                                                                                  {'loss': '0.726', 'grad_norm': '0.719', 'learning_rate': '0.0001', 'ppl': '2.067', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.58', 'tokens/total': 740640256, 'tokens/trainable': 90453784, 'epoch': '1.711'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 561/656 [26:37:10<4:30:01, 170.54s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 562/656 [26:40:01<4:27:23, 170.67s/it]                                                                                                                                                                                                                                                  {'loss': '0.7256', 'grad_norm': '0.7252', 'learning_rate': '0.0001', 'ppl': '2.066', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.74', 'tokens/total': 741961216, 'tokens/trainable': 90619472, 'epoch': '1.714'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 562/656 [26:40:01<4:27:23, 170.67s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 563/656 [26:42:53<4:25:03, 171.00s/it]                                                                                                                                                                                                                                                  {'loss': '0.7641', 'grad_norm': '0.7689', 'learning_rate': '0.0001', 'ppl': '2.147', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '26.11', 'tokens/total': 743282176, 'tokens/trainable': 90767744, 'epoch': '1.717'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 563/656 [26:42:53<4:25:03, 171.00s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 564/656 [26:45:44<4:22:12, 171.00s/it]                                                                                                                                                                                                                                                  {'loss': '0.7522', 'grad_norm': '0.7312', 'learning_rate': '0.0001', 'ppl': '2.122', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.28', 'tokens/total': 744603136, 'tokens/trainable': 90931944, 'epoch': '1.72'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 564/656 [26:45:44<4:22:12, 171.00s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 565/656 [26:48:35<4:19:14, 170.93s/it]                                                                                                                                                                                                                                                  {'loss': '0.7569', 'grad_norm': '0.7456', 'learning_rate': '0.0001', 'ppl': '2.132', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.65', 'tokens/total': 745924096, 'tokens/trainable': 91095816, 'epoch': '1.723'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 565/656 [26:48:35<4:19:14, 170.93s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 566/656 [26:51:25<4:16:13, 170.81s/it]                                                                                                                                                                                                                                                  {'loss': '0.772', 'grad_norm': '0.7447', 'learning_rate': '0.0001', 'ppl': '2.164', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.92', 'tokens/total': 747245056, 'tokens/trainable': 91248192, 'epoch': '1.726'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 566/656 [26:51:25<4:16:13, 170.81s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 567/656 [26:54:16<4:13:30, 170.90s/it]                                                                                                                                                                                                                                                  {'loss': '0.7304', 'grad_norm': '0.7436', 'learning_rate': '0.0001', 'ppl': '2.076', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '20.86', 'tokens/total': 748566016, 'tokens/trainable': 91404544, 'epoch': '1.729'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 567/656 [26:54:16<4:13:30, 170.90s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 568/656 [26:57:07<4:10:45, 170.98s/it]                                                                                                                                                                                                                                                  {'loss': '0.8069', 'grad_norm': '0.7959', 'learning_rate': '0.0001', 'ppl': '2.241', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '29.54', 'tokens/total': 749886976, 'tokens/trainable': 91550032, 'epoch': '1.732'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 568/656 [26:57:07<4:10:45, 170.98s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 569/656 [26:59:59<4:08:00, 171.04s/it]                                                                                                                                                                                                                                                  {'loss': '0.7432', 'grad_norm': '0.7019', 'learning_rate': '0.0001', 'ppl': '2.103', 'memory/max_active (GiB)': '54.76', 'memory/max_allocated (GiB)': '54.76', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.43', 'tokens/total': 751207936, 'tokens/trainable': 91723576, 'epoch': '1.735'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 569/656 [26:59:59<4:08:00, 171.04s/it] 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 570/656 [27:02:48<4:04:28, 170.57s/it]                                                                                                                                                                                                                                                  {'loss': '0.7306', 'grad_norm': '0.7169', 'learning_rate': '0.0001', 'ppl': '2.076', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.65', 'tokens/total': 752528896, 'tokens/trainable': 91898856, 'epoch': '1.738'}
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 570/656 [27:02:48<4:04:28, 170.57s/it] 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 571/656 [27:05:38<4:01:29, 170.47s/it]                                                                                                                                                                                                                                                  {'loss': '0.7622', 'grad_norm': '0.7521', 'learning_rate': '0.0001', 'ppl': '2.143', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '60.25', 'tokens/total': 753849856, 'tokens/trainable': 92049504, 'epoch': '1.742'}
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 571/656 [27:05:38<4:01:29, 170.47s/it] 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 572/656 [27:08:28<3:58:31, 170.38s/it]                                                                                                                                                                                                                                                  {'loss': '0.7439', 'grad_norm': '0.7158', 'learning_rate': '0.0001', 'ppl': '2.104', 'memory/max_active (GiB)': '54.78', 'memory/max_allocated (GiB)': '54.78', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '88.9', 'tokens/total': 755170816, 'tokens/trainable': 92212664, 'epoch': '1.745'}
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 572/656 [27:08:28<3:58:31, 170.38s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 573/656 [27:11:21<3:56:26, 170.92s/it]                                                                                                                                                                                                                                                  {'loss': '0.738', 'grad_norm': '0.7461', 'learning_rate': '0.0001', 'ppl': '2.092', 'memory/max_active (GiB)': '54.93', 'memory/max_allocated (GiB)': '54.93', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.35', 'tokens/total': 756491776, 'tokens/trainable': 92361720, 'epoch': '1.748'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 573/656 [27:11:21<3:56:26, 170.92s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 574/656 [27:14:10<3:52:53, 170.40s/it]                                                                                                                                                                                                                                                  {'loss': '0.7707', 'grad_norm': '0.7682', 'learning_rate': '0.0001', 'ppl': '2.161', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '24.26', 'tokens/total': 757812736, 'tokens/trainable': 92509832, 'epoch': '1.751'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 574/656 [27:14:10<3:52:53, 170.40s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 575/656 [27:17:00<3:49:49, 170.24s/it]                                                                                                                                                                                                                                                  {'loss': '0.7912', 'grad_norm': '0.7941', 'learning_rate': '0.0001', 'ppl': '2.206', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.68', 'tokens/total': 759133696, 'tokens/trainable': 92649536, 'epoch': '1.754'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 575/656 [27:17:00<3:49:49, 170.24s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 576/656 [27:19:50<3:46:56, 170.21s/it]                                                                                                                                                                                                                                                  {'loss': '0.7211', 'grad_norm': '0.7138', 'learning_rate': '0.0001', 'ppl': '2.057', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.62', 'tokens/total': 760454656, 'tokens/trainable': 92817184, 'epoch': '1.757'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 576/656 [27:19:50<3:46:56, 170.21s/it] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 577/656 [27:22:41<3:44:28, 170.49s/it]                                                                                                                                                                                                                                                  {'loss': '0.74', 'grad_norm': '0.7202', 'learning_rate': '0.0001', 'ppl': '2.096', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.3', 'tokens/total': 761775616, 'tokens/trainable': 92987960, 'epoch': '1.76'}
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 577/656 [27:22:41<3:44:28, 170.49s/it] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 578/656 [27:25:33<3:42:14, 170.95s/it]                                                                                                                                                                                                                                                  {'loss': '0.7238', 'grad_norm': '0.6991', 'learning_rate': '0.0001', 'ppl': '2.062', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.88', 'tokens/total': 763096576, 'tokens/trainable': 93155232, 'epoch': '1.763'}
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 578/656 [27:25:33<3:42:14, 170.95s/it] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 579/656 [27:28:25<3:39:43, 171.21s/it]                                                                                                                                                                                                                                                  {'loss': '0.7077', 'grad_norm': '0.7146', 'learning_rate': '0.0001', 'ppl': '2.029', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.69', 'tokens/total': 764417536, 'tokens/trainable': 93321872, 'epoch': '1.766'}
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 579/656 [27:28:25<3:39:43, 171.21s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 580/656 [27:31:16<3:36:46, 171.14s/it]                                                                                                                                                                                                                                                  {'loss': '0.7218', 'grad_norm': '0.738', 'learning_rate': '0.0001', 'ppl': '2.058', 'memory/max_active (GiB)': '54.96', 'memory/max_allocated (GiB)': '54.96', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.5', 'tokens/total': 765738496, 'tokens/trainable': 93484992, 'epoch': '1.769'}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 580/656 [27:31:16<3:36:46, 171.14s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 581/656 [27:34:06<3:33:41, 170.96s/it]                                                                                                                                                                                                                                                  {'loss': '0.7379', 'grad_norm': '0.7036', 'learning_rate': '0.0001', 'ppl': '2.092', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.72', 'tokens/total': 767059456, 'tokens/trainable': 93653304, 'epoch': '1.772'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                      | 581/656 [27:34:06<3:33:41, 170.96s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 582/656 [27:36:55<3:29:54, 170.19s/it]                                                                                                                                                                                                                                                  {'loss': '0.7473', 'grad_norm': '0.755', 'learning_rate': '0.0001', 'ppl': '2.111', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '65.58', 'tokens/total': 768380416, 'tokens/trainable': 93809136, 'epoch': '1.775'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 582/656 [27:36:55<3:29:54, 170.19s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 583/656 [27:39:46<3:27:27, 170.51s/it]                                                                                                                                                                                                                                                  {'loss': '0.7313', 'grad_norm': '0.7239', 'learning_rate': '0.0001', 'ppl': '2.078', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.35', 'tokens/total': 769701376, 'tokens/trainable': 93959832, 'epoch': '1.778'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 583/656 [27:39:46<3:27:27, 170.51s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 584/656 [27:42:37<3:24:57, 170.80s/it]                                                                                                                                                                                                                                                  {'loss': '0.7533', 'grad_norm': '0.7394', 'learning_rate': '0.0001', 'ppl': '2.124', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.53', 'tokens/total': 771022336, 'tokens/trainable': 94111384, 'epoch': '1.781'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 584/656 [27:42:37<3:24:57, 170.80s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 585/656 [27:45:27<3:21:35, 170.36s/it]                                                                                                                                                                                                                                                  {'loss': '0.7525', 'grad_norm': '0.7197', 'learning_rate': '0.0001', 'ppl': '2.122', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '34.72', 'tokens/total': 772343296, 'tokens/trainable': 94276768, 'epoch': '1.784'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 585/656 [27:45:27<3:21:35, 170.36s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 586/656 [27:48:18<3:19:11, 170.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.7216', 'grad_norm': '0.7544', 'learning_rate': '0.0001', 'ppl': '2.058', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '60.3', 'tokens/total': 773664256, 'tokens/trainable': 94428936, 'epoch': '1.787'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 586/656 [27:48:18<3:19:11, 170.73s/it] 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 587/656 [27:51:11<3:17:00, 171.31s/it]                                                                                                                                                                                                                                                  {'loss': '0.7128', 'grad_norm': '0.6993', 'learning_rate': '0.0001', 'ppl': '2.04', 'memory/max_active (GiB)': '54.95', 'memory/max_allocated (GiB)': '54.95', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.82', 'tokens/total': 774985216, 'tokens/trainable': 94598120, 'epoch': '1.79'}
 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 587/656 [27:51:11<3:17:00, 171.31s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 588/656 [27:54:02<3:14:10, 171.33s/it]                                                                                                                                                                                                                                                  {'loss': '0.7109', 'grad_norm': '0.7502', 'learning_rate': '0.0001', 'ppl': '2.036', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '41.92', 'tokens/total': 776306176, 'tokens/trainable': 94754576, 'epoch': '1.793'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 588/656 [27:54:02<3:14:10, 171.33s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 589/656 [27:56:52<3:10:42, 170.79s/it]                                                                                                                                                                                                                                                  {'loss': '0.8312', 'grad_norm': '0.7453', 'learning_rate': '0.0001', 'ppl': '2.296', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.01', 'tokens/total': 777627136, 'tokens/trainable': 94914512, 'epoch': '1.796'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 589/656 [27:56:52<3:10:42, 170.79s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 590/656 [27:59:42<3:07:29, 170.44s/it]                                                                                                                                                                                                                                                  {'loss': '0.7358', 'grad_norm': '0.7393', 'learning_rate': '0.0001', 'ppl': '2.087', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '26.98', 'tokens/total': 778948096, 'tokens/trainable': 95071872, 'epoch': '1.8'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 590/656 [27:59:42<3:07:29, 170.44s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 591/656 [28:02:33<3:04:54, 170.68s/it]                                                                                                                                                                                                                                                  {'loss': '0.7733', 'grad_norm': '0.7551', 'learning_rate': '0.0001', 'ppl': '2.167', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '59.62', 'tokens/total': 780269056, 'tokens/trainable': 95220664, 'epoch': '1.803'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 591/656 [28:02:33<3:04:54, 170.68s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 592/656 [28:05:23<3:01:50, 170.48s/it]                                                                                                                                                                                                                                                  {'loss': '0.7329', 'grad_norm': '0.7373', 'learning_rate': '0.0001', 'ppl': '2.081', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.5', 'tokens/total': 781590016, 'tokens/trainable': 95369664, 'epoch': '1.806'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 592/656 [28:05:23<3:01:50, 170.48s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 593/656 [28:08:14<2:59:07, 170.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.7393', 'grad_norm': '0.7358', 'learning_rate': '0.0001', 'ppl': '2.094', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.75', 'tokens/total': 782910976, 'tokens/trainable': 95522200, 'epoch': '1.809'}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 593/656 [28:08:14<2:59:07, 170.60s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 594/656 [28:11:05<2:56:25, 170.74s/it]                                                                                                                                                                                                                                                  {'loss': '0.7292', 'grad_norm': '0.9782', 'learning_rate': '0.0001', 'ppl': '2.073', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.8', 'tokens/total': 784231936, 'tokens/trainable': 95674520, 'epoch': '1.812'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 594/656 [28:11:05<2:56:25, 170.74s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 595/656 [28:13:55<2:53:28, 170.63s/it]                                                                                                                                                                                                                                                  {'loss': '0.7101', 'grad_norm': '0.7019', 'learning_rate': '0.0001', 'ppl': '2.034', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '67.3', 'tokens/total': 785552896, 'tokens/trainable': 95841120, 'epoch': '1.815'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 595/656 [28:13:55<2:53:28, 170.63s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 596/656 [28:16:47<2:50:52, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.762', 'grad_norm': '0.7211', 'learning_rate': '0.0001', 'ppl': '2.143', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '60.86', 'tokens/total': 786873856, 'tokens/trainable': 96003800, 'epoch': '1.818'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 596/656 [28:16:47<2:50:52, 170.88s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 597/656 [28:19:35<2:47:26, 170.28s/it]                                                                                                                                                                                                                                                  {'loss': '0.7106', 'grad_norm': '0.7433', 'learning_rate': '0.0001', 'ppl': '2.035', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.99', 'tokens/total': 788194816, 'tokens/trainable': 96144880, 'epoch': '1.821'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 597/656 [28:19:35<2:47:26, 170.28s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 598/656 [28:22:29<2:45:25, 171.14s/it]                                                                                                                                                                                                                                                  {'loss': '0.7574', 'grad_norm': '0.7112', 'learning_rate': '0.0001', 'ppl': '2.133', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '72.9', 'tokens/total': 789515776, 'tokens/trainable': 96301768, 'epoch': '1.824'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 598/656 [28:22:29<2:45:25, 171.14s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 599/656 [28:25:19<2:42:16, 170.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.7939', 'grad_norm': '0.7753', 'learning_rate': '0.0001', 'ppl': '2.212', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.57', 'tokens/total': 790836736, 'tokens/trainable': 96445304, 'epoch': '1.827'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 599/656 [28:25:19<2:42:16, 170.82s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 600/656 [28:28:11<2:39:53, 171.31s/it]                                                                                                                                                                                                                                                  {'loss': '0.7526', 'grad_norm': '0.7012', 'learning_rate': '0.0001', 'ppl': '2.123', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.22', 'tokens/total': 792157696, 'tokens/trainable': 96609192, 'epoch': '1.83'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 600/656 [28:28:11<2:39:53, 171.31s/it] 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 601/656 [28:31:04<2:37:20, 171.65s/it]                                                                                                                                                                                                                                                  {'loss': '0.7412', 'grad_norm': '0.6884', 'learning_rate': '0.0001', 'ppl': '2.098', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.54', 'tokens/total': 793478656, 'tokens/trainable': 96781312, 'epoch': '1.833'}
 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 601/656 [28:31:04<2:37:20, 171.65s/it] 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 602/656 [28:33:54<2:34:09, 171.28s/it]                                                                                                                                                                                                                                                  {'loss': '0.7187', 'grad_norm': '0.7121', 'learning_rate': '0.0001', 'ppl': '2.052', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '66.22', 'tokens/total': 794799616, 'tokens/trainable': 96943344, 'epoch': '1.836'}
 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 602/656 [28:33:54<2:34:09, 171.28s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 603/656 [28:36:46<2:31:24, 171.41s/it]                                                                                                                                                                                                                                                  {'loss': '0.7572', 'grad_norm': '0.716', 'learning_rate': '0.0001', 'ppl': '2.132', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '62.54', 'tokens/total': 796120576, 'tokens/trainable': 97106928, 'epoch': '1.839'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 603/656 [28:36:46<2:31:24, 171.41s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 604/656 [28:39:38<2:28:41, 171.57s/it]                                                                                                                                                                                                                                                  {'loss': '0.7248', 'grad_norm': '0.6836', 'learning_rate': '0.0001', 'ppl': '2.064', 'memory/max_active (GiB)': '54.94', 'memory/max_allocated (GiB)': '54.94', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.11', 'tokens/total': 797441536, 'tokens/trainable': 97280632, 'epoch': '1.842'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 604/656 [28:39:38<2:28:41, 171.57s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 605/656 [28:42:30<2:26:01, 171.80s/it]                                                                                                                                                                                                                                                  {'loss': '0.7377', 'grad_norm': '0.6951', 'learning_rate': '0.0001', 'ppl': '2.091', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.95', 'tokens/total': 798762496, 'tokens/trainable': 97456128, 'epoch': '1.845'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 605/656 [28:42:30<2:26:01, 171.80s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 606/656 [28:45:20<2:22:49, 171.39s/it]                                                                                                                                                                                                                                                  {'loss': '0.7141', 'grad_norm': '0.6891', 'learning_rate': '0.0001', 'ppl': '2.042', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.91', 'tokens/total': 800083456, 'tokens/trainable': 97625368, 'epoch': '1.848'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 606/656 [28:45:21<2:22:49, 171.39s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 607/656 [28:48:12<2:19:54, 171.32s/it]                                                                                                                                                                                                                                                  {'loss': '0.7544', 'grad_norm': '0.6946', 'learning_rate': '0.0001', 'ppl': '2.126', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.22', 'tokens/total': 801404416, 'tokens/trainable': 97798376, 'epoch': '1.851'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 607/656 [28:48:12<2:19:54, 171.32s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 608/656 [28:51:01<2:16:42, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.7681', 'grad_norm': '0.7413', 'learning_rate': '0.0001', 'ppl': '2.156', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.39', 'tokens/total': 802725376, 'tokens/trainable': 97954632, 'epoch': '1.854'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 608/656 [28:51:01<2:16:42, 170.88s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 609/656 [28:53:52<2:13:48, 170.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.742', 'grad_norm': '0.6836', 'learning_rate': '0.0001', 'ppl': '2.1', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '75.27', 'tokens/total': 804046336, 'tokens/trainable': 98130784, 'epoch': '1.857'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 609/656 [28:53:52<2:13:48, 170.82s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 610/656 [28:56:42<2:10:43, 170.52s/it]                                                                                                                                                                                                                                                  {'loss': '0.7539', 'grad_norm': '0.7728', 'learning_rate': '0.0001', 'ppl': '2.125', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '27.41', 'tokens/total': 805367296, 'tokens/trainable': 98279688, 'epoch': '1.861'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 610/656 [28:56:42<2:10:43, 170.52s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍             | 611/656 [28:59:31<2:07:33, 170.09s/it]                                                                                                                                                                                                                                                  {'loss': '0.7325', 'grad_norm': '0.7301', 'learning_rate': '0.0001', 'ppl': '2.08', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.79', 'tokens/total': 806688256, 'tokens/trainable': 98436344, 'epoch': '1.864'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍             | 611/656 [28:59:31<2:07:33, 170.09s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 612/656 [29:02:21<2:04:44, 170.09s/it]                                                                                                                                                                                                                                                  {'loss': '0.7752', 'grad_norm': '0.7368', 'learning_rate': '0.0001', 'ppl': '2.171', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.11', 'tokens/total': 808009216, 'tokens/trainable': 98594688, 'epoch': '1.867'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 612/656 [29:02:21<2:04:44, 170.09s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 613/656 [29:05:13<2:02:15, 170.60s/it]                                                                                                                                                                                                                                                  {'loss': '0.7362', 'grad_norm': '0.7006', 'learning_rate': '0.0001', 'ppl': '2.088', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '43.41', 'tokens/total': 809330176, 'tokens/trainable': 98759336, 'epoch': '1.87'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 613/656 [29:05:13<2:02:15, 170.60s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 614/656 [29:08:03<1:59:20, 170.49s/it]                                                                                                                                                                                                                                                  {'loss': '0.7459', 'grad_norm': '0.7442', 'learning_rate': '0.0001', 'ppl': '2.108', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '78.62', 'tokens/total': 810651136, 'tokens/trainable': 98910680, 'epoch': '1.873'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 614/656 [29:08:03<1:59:20, 170.49s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 615/656 [29:10:55<1:56:47, 170.93s/it]                                                                                                                                                                                                                                                  {'loss': '0.715', 'grad_norm': '0.6976', 'learning_rate': '0.0001', 'ppl': '2.044', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.3', 'tokens/total': 811972096, 'tokens/trainable': 99083360, 'epoch': '1.876'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 615/656 [29:10:55<1:56:47, 170.93s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 616/656 [29:13:46<1:53:53, 170.83s/it]                                                                                                                                                                                                                                                  {'loss': '0.7304', 'grad_norm': '0.7342', 'learning_rate': '0.0001', 'ppl': '2.076', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.18', 'tokens/total': 813293056, 'tokens/trainable': 99243600, 'epoch': '1.879'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 616/656 [29:13:46<1:53:53, 170.83s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 617/656 [29:16:38<1:51:16, 171.20s/it]                                                                                                                                                                                                                                                  {'loss': '0.7631', 'grad_norm': '0.7002', 'learning_rate': '0.0001', 'ppl': '2.145', 'memory/max_active (GiB)': '54.92', 'memory/max_allocated (GiB)': '54.92', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.27', 'tokens/total': 814614016, 'tokens/trainable': 99422312, 'epoch': '1.882'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 617/656 [29:16:38<1:51:16, 171.20s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 618/656 [29:19:27<1:48:06, 170.70s/it]                                                                                                                                                                                                                                                  {'loss': '0.7253', 'grad_norm': '0.7381', 'learning_rate': '0.0001', 'ppl': '2.065', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.49', 'tokens/total': 815934976, 'tokens/trainable': 99575728, 'epoch': '1.885'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 618/656 [29:19:27<1:48:06, 170.70s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 619/656 [29:22:19<1:45:22, 170.88s/it]                                                                                                                                                                                                                                                  {'loss': '0.8288', 'grad_norm': '0.7369', 'learning_rate': '0.0001', 'ppl': '2.291', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '44.77', 'tokens/total': 817255936, 'tokens/trainable': 99739768, 'epoch': '1.888'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 619/656 [29:22:19<1:45:22, 170.88s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 620/656 [29:25:11<1:42:43, 171.20s/it]                                                                                                                                                                                                                                                  {'loss': '0.7559', 'grad_norm': '0.7417', 'learning_rate': '0.0001', 'ppl': '2.129', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '61.76', 'tokens/total': 818576896, 'tokens/trainable': 99891592, 'epoch': '1.891'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 620/656 [29:25:11<1:42:43, 171.20s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 621/656 [29:28:02<1:39:53, 171.24s/it]                                                                                                                                                                                                                                                  {'loss': '0.7449', 'grad_norm': '0.686', 'learning_rate': '0.0001', 'ppl': '2.106', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '61.05', 'tokens/total': 819897856, 'tokens/trainable': 100066800, 'epoch': '1.894'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 621/656 [29:28:02<1:39:53, 171.24s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 622/656 [29:30:56<1:37:36, 172.26s/it]                                                                                                                                                                                                                                                  {'loss': '0.7545', 'grad_norm': '0.7421', 'learning_rate': '0.0001', 'ppl': '2.127', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '52.7', 'tokens/total': 821218816, 'tokens/trainable': 100220072, 'epoch': '1.897'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 622/656 [29:30:56<1:37:36, 172.26s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 623/656 [29:33:49<1:34:43, 172.22s/it]                                                                                                                                                                                                                                                  {'loss': '0.7644', 'grad_norm': '0.7203', 'learning_rate': '0.0001', 'ppl': '2.148', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '45.25', 'tokens/total': 822539776, 'tokens/trainable': 100391936, 'epoch': '1.9'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 623/656 [29:33:49<1:34:43, 172.22s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 624/656 [29:36:40<1:31:39, 171.86s/it]                                                                                                                                                                                                                                                  {'loss': '0.7531', 'grad_norm': '0.7393', 'learning_rate': '0.0001', 'ppl': '2.123', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '60.73', 'tokens/total': 823860736, 'tokens/trainable': 100548968, 'epoch': '1.903'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 624/656 [29:36:40<1:31:39, 171.86s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 625/656 [29:39:31<1:28:43, 171.73s/it]                                                                                                                                                                                                                                                  {'loss': '0.7384', 'grad_norm': '0.7496', 'learning_rate': '0.0001', 'ppl': '2.093', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.49', 'tokens/total': 825181696, 'tokens/trainable': 100696752, 'epoch': '1.906'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 625/656 [29:39:31<1:28:43, 171.73s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 626/656 [29:42:20<1:25:24, 170.82s/it]                                                                                                                                                                                                                                                  {'loss': '0.7378', 'grad_norm': '0.7411', 'learning_rate': '0.0001', 'ppl': '2.091', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.21', 'tokens/total': 826502656, 'tokens/trainable': 100845096, 'epoch': '1.909'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 626/656 [29:42:20<1:25:24, 170.82s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 627/656 [29:45:13<1:22:54, 171.52s/it]                                                                                                                                                                                                                                                  {'loss': '0.7256', 'grad_norm': '0.7019', 'learning_rate': '0.0001', 'ppl': '2.066', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.81', 'tokens/total': 827823616, 'tokens/trainable': 101014496, 'epoch': '1.912'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 627/656 [29:45:13<1:22:54, 171.52s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 628/656 [29:48:03<1:19:48, 171.01s/it]                                                                                                                                                                                                                                                  {'loss': '0.6842', 'grad_norm': '0.7159', 'learning_rate': '0.0001', 'ppl': '1.982', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.95', 'tokens/total': 829144576, 'tokens/trainable': 101184368, 'epoch': '1.915'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 628/656 [29:48:03<1:19:48, 171.01s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 629/656 [29:50:53<1:16:53, 170.86s/it]                                                                                                                                                                                                                                                  {'loss': '0.7546', 'grad_norm': '0.7393', 'learning_rate': '0.0001', 'ppl': '2.127', 'memory/max_active (GiB)': '54.9', 'memory/max_allocated (GiB)': '54.9', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '46.6', 'tokens/total': 830465536, 'tokens/trainable': 101346456, 'epoch': '1.919'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 629/656 [29:50:53<1:16:53, 170.86s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 630/656 [29:53:43<1:13:54, 170.55s/it]                                                                                                                                                                                                                                                  {'loss': '0.7529', 'grad_norm': '0.6906', 'learning_rate': '0.0001', 'ppl': '2.123', 'memory/max_active (GiB)': '54.91', 'memory/max_allocated (GiB)': '54.91', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.8', 'tokens/total': 831786496, 'tokens/trainable': 101513656, 'epoch': '1.922'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 630/656 [29:53:43<1:13:54, 170.55s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 631/656 [29:56:34<1:11:08, 170.74s/it]                                                                                                                                                                                                                                                  {'loss': '0.738', 'grad_norm': '0.7382', 'learning_rate': '0.0001', 'ppl': '2.092', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.49', 'tokens/total': 833107456, 'tokens/trainable': 101660240, 'epoch': '1.925'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 631/656 [29:56:34<1:11:08, 170.74s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 632/656 [29:59:24<1:08:12, 170.54s/it]                                                                                                                                                                                                                                                  {'loss': '0.7129', 'grad_norm': '0.6984', 'learning_rate': '0.0001', 'ppl': '2.04', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '19.25', 'tokens/total': 834428416, 'tokens/trainable': 101823816, 'epoch': '1.928'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 632/656 [29:59:24<1:08:12, 170.54s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 633/656 [30:02:17<1:05:39, 171.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.7792', 'grad_norm': '0.7332', 'learning_rate': '0.0001', 'ppl': '2.18', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '54.64', 'tokens/total': 835749376, 'tokens/trainable': 101986256, 'epoch': '1.931'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 633/656 [30:02:17<1:05:39, 171.29s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 634/656 [30:05:10<1:02:54, 171.57s/it]                                                                                                                                                                                                                                                  {'loss': '0.6846', 'grad_norm': '0.6551', 'learning_rate': '0.0001', 'ppl': '1.983', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '36.45', 'tokens/total': 837070336, 'tokens/trainable': 102173832, 'epoch': '1.934'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 634/656 [30:05:10<1:02:54, 171.57s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 635/656 [30:08:01<1:00:02, 171.55s/it]                                                                                                                                                                                                                                                  {'loss': '0.7454', 'grad_norm': '0.7156', 'learning_rate': '0.0001', 'ppl': '2.107', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.9', 'tokens/total': 838391296, 'tokens/trainable': 102338208, 'epoch': '1.937'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 635/656 [30:08:01<1:00:02, 171.55s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 636/656 [30:10:50<56:56, 170.81s/it]                                                                                                                                                                                                                                                  {'loss': '0.7124', 'grad_norm': '0.7078', 'learning_rate': '0.0001', 'ppl': '2.039', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '81.43', 'tokens/total': 839712256, 'tokens/trainable': 102502848, 'epoch': '1.94'}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 636/656 [30:10:50<56:56, 170.81s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 637/656 [30:13:39<53:54, 170.22s/it]                                                                                                                                                                                                                                                  {'loss': '0.7464', 'grad_norm': '0.7591', 'learning_rate': '0.0001', 'ppl': '2.109', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '28.82', 'tokens/total': 841033216, 'tokens/trainable': 102642984, 'epoch': '1.943'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 637/656 [30:13:39<53:54, 170.22s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 638/656 [30:16:31<51:16, 170.89s/it]                                                                                                                                                                                                                                                  {'loss': '0.7217', 'grad_norm': '0.7016', 'learning_rate': '0.0001', 'ppl': '2.058', 'memory/max_active (GiB)': '54.95', 'memory/max_allocated (GiB)': '54.95', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '56.65', 'tokens/total': 842354176, 'tokens/trainable': 102809976, 'epoch': '1.946'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 638/656 [30:16:31<51:16, 170.89s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 639/656 [30:19:22<48:23, 170.78s/it]                                                                                                                                                                                                                                                  {'loss': '0.808', 'grad_norm': '0.7576', 'learning_rate': '0.0001', 'ppl': '2.243', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.8', 'tokens/total': 843675136, 'tokens/trainable': 102967664, 'epoch': '1.949'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 639/656 [30:19:22<48:23, 170.78s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 640/656 [30:22:11<45:22, 170.18s/it]                                                                                                                                                                                                                                                  {'loss': '0.8028', 'grad_norm': '0.7771', 'learning_rate': '0.0001', 'ppl': '2.232', 'memory/max_active (GiB)': '54.77', 'memory/max_allocated (GiB)': '54.77', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '58.89', 'tokens/total': 844996096, 'tokens/trainable': 103110016, 'epoch': '1.952'}
 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 640/656 [30:22:11<45:22, 170.18s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 641/656 [30:25:01<42:34, 170.29s/it]                                                                                                                                                                                                                                                  {'loss': '0.7548', 'grad_norm': '0.751', 'learning_rate': '0.0001', 'ppl': '2.127', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.05', 'tokens/total': 846317056, 'tokens/trainable': 103261168, 'epoch': '1.955'}
 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 641/656 [30:25:01<42:34, 170.29s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 642/656 [30:27:50<39:39, 169.95s/it]                                                                                                                                                                                                                                                  {'loss': '0.6997', 'grad_norm': '0.685', 'learning_rate': '0.0001', 'ppl': '2.013', 'memory/max_active (GiB)': '54.84', 'memory/max_allocated (GiB)': '54.84', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '48.94', 'tokens/total': 847638016, 'tokens/trainable': 103437456, 'epoch': '1.958'}
 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 642/656 [30:27:50<39:39, 169.95s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 643/656 [30:30:41<36:52, 170.17s/it]                                                                                                                                                                                                                                                  {'loss': '0.709', 'grad_norm': '0.7178', 'learning_rate': '0.0001', 'ppl': '2.032', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '31.85', 'tokens/total': 848958976, 'tokens/trainable': 103599616, 'epoch': '1.961'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 643/656 [30:30:41<36:52, 170.17s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 644/656 [30:33:32<34:05, 170.49s/it]                                                                                                                                                                                                                                                  {'loss': '0.7856', 'grad_norm': '0.7153', 'learning_rate': '0.0001', 'ppl': '2.194', 'memory/max_active (GiB)': '54.86', 'memory/max_allocated (GiB)': '54.86', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.56', 'tokens/total': 850279936, 'tokens/trainable': 103763936, 'epoch': '1.964'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 644/656 [30:33:32<34:05, 170.49s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 645/656 [30:36:21<31:10, 170.03s/it]                                                                                                                                                                                                                                                  {'loss': '0.7661', 'grad_norm': '0.7235', 'learning_rate': '0.0001', 'ppl': '2.151', 'memory/max_active (GiB)': '54.83', 'memory/max_allocated (GiB)': '54.83', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '30.68', 'tokens/total': 851600896, 'tokens/trainable': 103920512, 'epoch': '1.967'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 645/656 [30:36:21<31:10, 170.03s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 646/656 [30:39:14<28:28, 170.86s/it]                                                                                                                                                                                                                                                  {'loss': '0.7405', 'grad_norm': '0.7478', 'learning_rate': '0.0001', 'ppl': '2.097', 'memory/max_active (GiB)': '54.89', 'memory/max_allocated (GiB)': '54.89', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '70.83', 'tokens/total': 852921856, 'tokens/trainable': 104062568, 'epoch': '1.97'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 646/656 [30:39:14<28:28, 170.86s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 647/656 [30:42:04<25:34, 170.55s/it]                                                                                                                                                                                                                                                  {'loss': '0.7373', 'grad_norm': '0.7418', 'learning_rate': '0.0001', 'ppl': '2.09', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '30.91', 'tokens/total': 854242816, 'tokens/trainable': 104213760, 'epoch': '1.973'}
 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 647/656 [30:42:04<25:34, 170.55s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 648/656 [30:44:55<22:44, 170.62s/it]                                                                                                                                                                                                                                                  {'loss': '0.8027', 'grad_norm': '0.7315', 'learning_rate': '0.0001', 'ppl': '2.231', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '53.95', 'tokens/total': 855563776, 'tokens/trainable': 104373832, 'epoch': '1.977'}
 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 648/656 [30:44:55<22:44, 170.62s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 649/656 [30:47:46<19:54, 170.70s/it]                                                                                                                                                                                                                                                  {'loss': '0.7268', 'grad_norm': '0.701', 'learning_rate': '0.0001', 'ppl': '2.069', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '50.77', 'tokens/total': 856884736, 'tokens/trainable': 104536016, 'epoch': '1.98'}
 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 649/656 [30:47:46<19:54, 170.70s/it] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 650/656 [30:50:36<17:04, 170.70s/it]                                                                                                                                                                                                                                                  {'loss': '0.696', 'grad_norm': '0.6483', 'learning_rate': '0.0001', 'ppl': '2.006', 'memory/max_active (GiB)': '54.79', 'memory/max_allocated (GiB)': '54.79', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '60.21', 'tokens/total': 858205696, 'tokens/trainable': 104732304, 'epoch': '1.983'}
 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 650/656 [30:50:36<17:04, 170.70s/it] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 651/656 [30:53:29<14:15, 171.15s/it]                                                                                                                                                                                                                                                  {'loss': '0.7282', 'grad_norm': '0.7064', 'learning_rate': '0.0001', 'ppl': '2.071', 'memory/max_active (GiB)': '54.82', 'memory/max_allocated (GiB)': '54.82', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '38.26', 'tokens/total': 859526656, 'tokens/trainable': 104898920, 'epoch': '1.986'}
 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 651/656 [30:53:29<14:15, 171.15s/it] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 652/656 [30:56:21<11:25, 171.48s/it]                                                                                                                                                                                                                                                  {'loss': '0.7788', 'grad_norm': '0.6852', 'learning_rate': '0.0001', 'ppl': '2.179', 'memory/max_active (GiB)': '54.85', 'memory/max_allocated (GiB)': '54.85', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '64.36', 'tokens/total': 860847616, 'tokens/trainable': 105074776, 'epoch': '1.989'}
 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 652/656 [30:56:21<11:25, 171.48s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 653/656 [30:59:10<08:32, 170.71s/it]                                                                                                                                                                                                                                                     {'loss': '0.7445', 'grad_norm': '0.7562', 'learning_rate': '0.0001', 'ppl': '2.105', 'memory/max_active (GiB)': '54.8', 'memory/max_allocated (GiB)': '54.8', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '40.48', 'tokens/total': 862168576, 'tokens/trainable': 105224048, 'epoch': '1.992'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 653/656 [30:59:10<08:32, 170.71s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 654/656 [31:01:59<05:40, 170.16s/it]                                                                                                                                                                                                                                                     {'loss': '0.7566', 'grad_norm': '0.7531', 'learning_rate': '0.0001', 'ppl': '2.131', 'memory/max_active (GiB)': '54.81', 'memory/max_allocated (GiB)': '54.81', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '39.73', 'tokens/total': 863489536, 'tokens/trainable': 105376976, 'epoch': '1.995'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 654/656 [31:01:59<05:40, 170.16s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 655/656 [31:04:49<02:50, 170.24s/it]                                                                                                                                                                                                                                                     {'loss': '0.7597', 'grad_norm': '0.7604', 'learning_rate': '0.0001', 'ppl': '2.138', 'memory/max_active (GiB)': '54.87', 'memory/max_allocated (GiB)': '54.87', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '33.31', 'tokens/total': 864810496, 'tokens/trainable': 105525424, 'epoch': '1.998'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 655/656 [31:04:49<02:50, 170.24s/it]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 656/656 [31:06:47<00:00, 154.66s/it]                                                                                                                                                                                                                                                     {'loss': '0.7709', 'grad_norm': '0.8657', 'learning_rate': '0.0001', 'ppl': '2.162', 'memory/max_active (GiB)': '54.88', 'memory/max_allocated (GiB)': '54.88', 'memory/device_reserved (GiB)': '62.07', 'tokens/train_per_sec_per_gpu': '51.19', 'tokens/total': 865713152, 'tokens/trainable': 105633600, 'epoch': '2'}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 656/656 [31:06:47<00:00, 154.66s/it][2026-02-13 10:46:55,737] [INFO] [axolotl.core.trainers.base._save:721] [PID:9815] Saving model checkpoint to ./finetune-model-output/checkpoint-656
                                                                                                                                                                                                                                                     {'train_runtime': '1.121e+05', 'train_samples_per_second': '0.702', 'train_steps_per_second': '0.006', 'train_loss': '0.8211', 'memory/max_active (GiB)': '29.61', 'memory/max_allocated (GiB)': '29.61', 'memory/device_reserved (GiB)': '62.07', 'epoch': '2', 'tokens/train_per_sec_per_gpu': '0'}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 656/656 [31:08:17<00:00, 154.66s/it]100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 656/656 [31:08:17<00:00, 170.88s/it]
[2026-02-13 10:49:16,120] [INFO] [axolotl.train.save_trained_model:226] [PID:9815] Training completed! Saving trained model to ./finetune-model-output.
[2026-02-13 10:49:35,981] [INFO] [axolotl.train.save_trained_model:340] [PID:9815] Model successfully saved to ./finetune-model-output
[2026-02-13 10:49:36,655] [INFO] [axolotl.core.trainers.base._save:721] [PID:9815] Saving model checkpoint to ./finetune-model-output
Processing Files (0 / 0)      : |                                                                                                                                                                                       |  0.00B /  0.00B            
New Data Upload               : |                                                                                                                                                                                       |  0.00B /  0.00B            [A

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:   5%|████████▉                                                                                                                                                                          | 67.1MB / 1.34GB            [A[A[A

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:   5%|████████▉                                                                                                                                                                          | 67.1MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :   5%|████████▉                                                                                                                                                                          | 67.1MB / 1.34GB,   ???B/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  10%|█████████████████▉                                                                                                                                                                 |  134MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  10%|█████████████████▉                                                                                                                                                                 |  134MB / 1.34GB,  335MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  16%|███████████████████████████▉                                                                                                                                                       |  210MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  16%|███████████████████████████▉                                                                                                                                                       |  210MB / 1.34GB,  356MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  20%|███████████████████████████████████▊                                                                                                                                               |  268MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  20%|███████████████████████████████████▊                                                                                                                                               |  268MB / 1.34GB,  335MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  24%|███████████████████████████████████████████▋                                                                                                                                       |  327MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  24%|███████████████████████████████████████████▋                                                                                                                                       |  327MB / 1.34GB,  325MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  28%|██████████████████████████████████████████████████▎                                                                                                                                |  377MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  28%|██████████████████████████████████████████████████▎                                                                                                                                |  377MB / 1.34GB,  310MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  32%|█████████████████████████████████████████████████████████                                                                                                                          |  428MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  32%|█████████████████████████████████████████████████████████                                                                                                                          |  428MB / 1.34GB,  301MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  36%|███████████████████████████████████████████████████████████████▊                                                                                                                   |  478MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  36%|███████████████████████████████████████████████████████████████▊                                                                                                                   |  478MB / 1.34GB,  294MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  39%|██████████████████████████████████████████████████████████████████████▍                                                                                                            |  528MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  39%|██████████████████████████████████████████████████████████████████████▍                                                                                                            |  528MB / 1.34GB,  288MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  44%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                    |  587MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  44%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                    |  587MB / 1.34GB,  289MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  47%|█████████████████████████████████████████████████████████████████████████████████████                                                                                              |  638MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  47%|█████████████████████████████████████████████████████████████████████████████████████                                                                                              |  638MB / 1.34GB,  285MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  52%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      |  696MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  52%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      |  696MB / 1.34GB,  286MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  56%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               |  747MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  56%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               |  747MB / 1.34GB,  283MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        |  797MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        |  797MB / 1.34GB,  281MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                 |  856MB / 1.34GB            [A[A[A

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        |  923MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                        |  923MB / 1.34GB,  285MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 |  973MB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 |  973MB / 1.34GB,  283MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 1.03GB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 1.03GB / 1.34GB,  284MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 1.08GB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 1.08GB / 1.34GB,  282MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 1.13GB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                            | 1.13GB / 1.34GB,  280MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 1.19GB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 1.19GB / 1.34GB,  281MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 1.24GB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 1.24GB / 1.34GB,  280MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 1.29GB / 1.34GB            [A[A[AProcessing Files (1 / 2)      :  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 1.29GB / 1.34GB,  278MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.34GB / 1.34GB            [A[A[AProcessing Files (2 / 2)      : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.34GB / 1.34GB,  277MB/s  

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.34GB / 1.34GB            [A[A[A

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.34GB / 1.34GB            [A[A[A

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.34GB / 1.34GB            [A[A[A

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.34GB / 1.34GB            [A[A[A

  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            [A[A


  ...adapter_model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.34GB / 1.34GB            [A[A[AProcessing Files (2 / 2)      : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.34GB / 1.34GB,  236MB/s  
New Data Upload               : |                                                                                                                                                                                       |  0.00B /  0.00B,  0.00B/s  
  ...-output/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.99kB / 6.99kB            
  ...adapter_model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.34GB / 1.34GB