[2026-02-04 03:22:54,042] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:23602] bf16 support detected, enabling for this configuration.
[2026-02-04 03:22:54,243] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:23602] baseline 0.000GB ()
[2026-02-04 03:22:54,245] [INFO] [axolotl.cli.config.load_cfg:259] [PID:23602] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "T2J_SFT_4B.yaml",
  "base_model": "Qwen/Qwen2.5-1.5B-Instruct",
  "base_model_config": "Qwen/Qwen2.5-1.5B-Instruct",
  "batch_size": 256,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": true,
    "n_gpu": 1,
    "n_node": 1
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 192,
  "dataset_prepared_path": "preprocess",
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "train_T2J.jsonl",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 2,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": false,
  "eval_table_size": 0,
  "evals_per_epoch": 0,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "gradient_accumulation_steps": 128,
  "hub_model_id": "amphora/FC-T2J-SFT-1_5B",
  "include_tkps": true,
  "learning_rate": 2e-05,
  "liger_fused_linear_cross_entropy": true,
  "liger_glu_activation": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "micro_batch_size": 2,
  "model_config_type": "qwen2",
  "num_epochs": 3.0,
  "optimizer": "adamw_torch_fused",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./outputs",
  "pad_to_sequence_len": false,
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin"
  ],
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": false,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 0.3333333333333333,
  "saves_per_epoch": 1,
  "sequence_len": 16384,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Qwen/Qwen2.5-1.5B-Instruct",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "scale_rewards": true,
    "sync_ref_model": false,
    "use_vllm": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.01,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_project": "FC-T2J",
  "warmup_ratio": 0.05,
  "weight_decay": 0.01,
  "world_size": 1
}
[2026-02-04 03:22:55,706] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:23602] EOS: 151645 / <|im_end|>
[2026-02-04 03:22:55,707] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:23602] BOS: None / None
[2026-02-04 03:22:55,707] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:23602] PAD: 151643 / <|endoftext|>
[2026-02-04 03:22:55,707] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:23602] UNK: None / None
[2026-02-04 03:22:55,709] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:23602] Loading prepared dataset from disk at preprocess/d897fb18203056becd1259947bafc7e4...
Loading dataset from disk:   0%|                                                                        | 0/192 [00:00<?, ?it/s]Loading dataset from disk: 100%|███████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 10992.59it/s]
[2026-02-04 03:22:58,890] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:417] [PID:23602] total_num_tokens: 838_886_301
[2026-02-04 03:23:10,146] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:435] [PID:23602] `total_supervised_tokens: 409_623_431`
[2026-02-04 03:23:10,146] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:533] [PID:23602] total_num_steps: 1751
[2026-02-04 03:23:10,146] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:23602] Maximum number of steps set at 1751
[2026-02-04 03:23:10,241] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:23602] loading tokenizer... Qwen/Qwen2.5-1.5B-Instruct
[2026-02-04 03:23:11,868] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:285] [PID:23602] EOS: 151645 / <|im_end|>
[2026-02-04 03:23:11,868] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:286] [PID:23602] BOS: None / None
[2026-02-04 03:23:11,868] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:287] [PID:23602] PAD: 151643 / <|endoftext|>
[2026-02-04 03:23:11,869] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:288] [PID:23602] UNK: None / None
[2026-02-04 03:23:11,869] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:23602] Loading model
[2026-02-04 03:23:12,011] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:23602] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-02-04 03:23:12,016] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:23602] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-02-04 03:23:12,201] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:23602] Applying LIGER to qwen2 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'swiglu': True}
Loading weights:   0%|                                                                                  | 0/338 [00:00<?, ?it/s]Loading weights:   0%|                         | 1/338 [00:00<00:00, 5133.79it/s, Materializing param=model.embed_tokens.weight]Loading weights:   0%|                         | 1/338 [00:00<00:00, 2549.73it/s, Materializing param=model.embed_tokens.weight]Loading weights:   1%|             | 2/338 [00:00<00:00, 1604.25it/s, Materializing param=model.layers.0.input_layernorm.weight]Loading weights:   1%|             | 2/338 [00:00<00:00, 1410.56it/s, Materializing param=model.layers.0.input_layernorm.weight]Loading weights:   1%|▏              | 3/338 [00:00<00:00, 1739.41it/s, Materializing param=model.layers.0.mlp.down_proj.weight]Loading weights:   1%|▏              | 3/338 [00:00<00:00, 1377.74it/s, Materializing param=model.layers.0.mlp.down_proj.weight]Loading weights:   1%|▏              | 4/338 [00:00<00:00, 1629.65it/s, Materializing param=model.layers.0.mlp.gate_proj.weight]Loading weights:   1%|▏              | 4/338 [00:00<00:00, 1539.76it/s, Materializing param=model.layers.0.mlp.gate_proj.weight]Loading weights:   1%|▎                | 5/338 [00:00<00:00, 1752.74it/s, Materializing param=model.layers.0.mlp.up_proj.weight]Loading weights:   1%|▎                | 5/338 [00:00<00:00, 1671.70it/s, Materializing param=model.layers.0.mlp.up_proj.weight]Loading weights:   2%|    | 6/338 [00:00<00:00, 1841.63it/s, Materializing param=model.layers.0.post_attention_layernorm.weight]Loading weights:   2%|    | 6/338 [00:00<00:00, 1767.88it/s, Materializing param=model.layers.0.post_attention_layernorm.weight]Loading weights:   2%|▎             | 7/338 [00:00<00:00, 1939.11it/s, Materializing param=model.layers.0.self_attn.k_proj.bias]Loading weights:   2%|▎             | 7/338 [00:00<00:00, 1869.48it/s, Materializing param=model.layers.0.self_attn.k_proj.bias]Loading weights:   2%|▎           | 8/338 [00:00<00:00, 1552.01it/s, Materializing param=model.layers.0.self_attn.k_proj.weight]Loading weights:   2%|▎           | 8/338 [00:00<00:00, 1511.46it/s, Materializing param=model.layers.0.self_attn.k_proj.weight]Loading weights:   3%|▎           | 9/338 [00:00<00:00, 1457.59it/s, Materializing param=model.layers.0.self_attn.o_proj.weight]Loading weights:   3%|▎           | 9/338 [00:00<00:00, 1424.75it/s, Materializing param=model.layers.0.self_attn.o_proj.weight]Loading weights:   3%|▍            | 10/338 [00:00<00:00, 1529.15it/s, Materializing param=model.layers.0.self_attn.q_proj.bias]Loading weights:   3%|▍            | 10/338 [00:00<00:00, 1498.82it/s, Materializing param=model.layers.0.self_attn.q_proj.bias]Loading weights:   3%|▎          | 11/338 [00:00<00:00, 1597.50it/s, Materializing param=model.layers.0.self_attn.q_proj.weight]Loading weights:   3%|▎          | 11/338 [00:00<00:00, 1567.43it/s, Materializing param=model.layers.0.self_attn.q_proj.weight]Loading weights:   4%|▍            | 12/338 [00:00<00:00, 1648.22it/s, Materializing param=model.layers.0.self_attn.v_proj.bias]Loading weights:   4%|▍            | 12/338 [00:00<00:00, 1610.10it/s, Materializing param=model.layers.0.self_attn.v_proj.bias]Loading weights:   4%|▍          | 13/338 [00:00<00:00, 1555.13it/s, Materializing param=model.layers.0.self_attn.v_proj.weight]Loading weights:   4%|▍          | 13/338 [00:00<00:00, 1520.82it/s, Materializing param=model.layers.0.self_attn.v_proj.weight]Loading weights:   4%|▍           | 14/338 [00:00<00:00, 1500.91it/s, Materializing param=model.layers.1.input_layernorm.weight]Loading weights:   4%|▍           | 14/338 [00:00<00:00, 1471.87it/s, Materializing param=model.layers.1.input_layernorm.weight]Loading weights:   4%|▌             | 15/338 [00:00<00:00, 1527.91it/s, Materializing param=model.layers.1.mlp.down_proj.weight]Loading weights:   4%|▌             | 15/338 [00:00<00:00, 1497.79it/s, Materializing param=model.layers.1.mlp.down_proj.weight]Loading weights:   5%|▋             | 16/338 [00:00<00:00, 1551.90it/s, Materializing param=model.layers.1.mlp.gate_proj.weight]Loading weights:   5%|▋             | 16/338 [00:00<00:00, 1524.06it/s, Materializing param=model.layers.1.mlp.gate_proj.weight]Loading weights:   5%|▊               | 17/338 [00:00<00:00, 1573.40it/s, Materializing param=model.layers.1.mlp.up_proj.weight]Loading weights:   5%|▊               | 17/338 [00:00<00:00, 1547.31it/s, Materializing param=model.layers.1.mlp.up_proj.weight]Loading weights:   5%|▏  | 18/338 [00:00<00:00, 1581.86it/s, Materializing param=model.layers.1.post_attention_layernorm.weight]Loading weights:   5%|▏  | 18/338 [00:00<00:00, 1517.35it/s, Materializing param=model.layers.1.post_attention_layernorm.weight]Loading weights:   6%|▋            | 19/338 [00:00<00:00, 1550.42it/s, Materializing param=model.layers.1.self_attn.k_proj.bias]Loading weights:   6%|▋            | 19/338 [00:00<00:00, 1527.83it/s, Materializing param=model.layers.1.self_attn.k_proj.bias]Loading weights:   6%|▋          | 20/338 [00:00<00:00, 1569.58it/s, Materializing param=model.layers.1.self_attn.k_proj.weight]Loading weights:   6%|▋          | 20/338 [00:00<00:00, 1547.77it/s, Materializing param=model.layers.1.self_attn.k_proj.weight]Loading weights:   6%|▋          | 21/338 [00:00<00:00, 1584.24it/s, Materializing param=model.layers.1.self_attn.o_proj.weight]Loading weights:   6%|▋          | 21/338 [00:00<00:00, 1509.70it/s, Materializing param=model.layers.1.self_attn.o_proj.weight]Loading weights:   7%|▊            | 22/338 [00:00<00:00, 1546.03it/s, Materializing param=model.layers.1.self_attn.q_proj.bias]Loading weights:   7%|▊            | 22/338 [00:00<00:00, 1526.74it/s, Materializing param=model.layers.1.self_attn.q_proj.bias]Loading weights:   7%|▋          | 23/338 [00:00<00:00, 1550.20it/s, Materializing param=model.layers.1.self_attn.q_proj.weight]Loading weights:   7%|▋          | 23/338 [00:00<00:00, 1531.89it/s, Materializing param=model.layers.1.self_attn.q_proj.weight]Loading weights:   7%|▉            | 24/338 [00:00<00:00, 1542.73it/s, Materializing param=model.layers.1.self_attn.v_proj.bias]Loading weights:   7%|▉            | 24/338 [00:00<00:00, 1524.99it/s, Materializing param=model.layers.1.self_attn.v_proj.bias]Loading weights:   7%|▊          | 25/338 [00:00<00:00, 1555.80it/s, Materializing param=model.layers.1.self_attn.v_proj.weight]Loading weights:   7%|▊          | 25/338 [00:00<00:00, 1538.43it/s, Materializing param=model.layers.1.self_attn.v_proj.weight]Loading weights:   8%|▉           | 26/338 [00:00<00:00, 1495.99it/s, Materializing param=model.layers.2.input_layernorm.weight]Loading weights:   8%|▉           | 26/338 [00:00<00:00, 1478.91it/s, Materializing param=model.layers.2.input_layernorm.weight]Loading weights:   8%|█             | 27/338 [00:00<00:00, 1505.83it/s, Materializing param=model.layers.2.mlp.down_proj.weight]Loading weights:   8%|█             | 27/338 [00:00<00:00, 1488.79it/s, Materializing param=model.layers.2.mlp.down_proj.weight]Loading weights:   8%|█▏            | 28/338 [00:00<00:00, 1515.77it/s, Materializing param=model.layers.2.mlp.gate_proj.weight]Loading weights:   8%|█▏            | 28/338 [00:00<00:00, 1499.63it/s, Materializing param=model.layers.2.mlp.gate_proj.weight]Loading weights:   9%|█▎              | 29/338 [00:00<00:00, 1468.72it/s, Materializing param=model.layers.2.mlp.up_proj.weight]Loading weights:   9%|█▎              | 29/338 [00:00<00:00, 1453.38it/s, Materializing param=model.layers.2.mlp.up_proj.weight]Loading weights:   9%|▎  | 30/338 [00:00<00:00, 1478.99it/s, Materializing param=model.layers.2.post_attention_layernorm.weight]Loading weights:   9%|▎  | 30/338 [00:00<00:00, 1464.41it/s, Materializing param=model.layers.2.post_attention_layernorm.weight]Loading weights:   9%|█▏           | 31/338 [00:00<00:00, 1453.98it/s, Materializing param=model.layers.2.self_attn.k_proj.bias]Loading weights:   9%|█▏           | 31/338 [00:00<00:00, 1440.51it/s, Materializing param=model.layers.2.self_attn.k_proj.bias]Loading weights:   9%|█          | 32/338 [00:00<00:00, 1454.00it/s, Materializing param=model.layers.2.self_attn.k_proj.weight]Loading weights:   9%|█          | 32/338 [00:00<00:00, 1440.40it/s, Materializing param=model.layers.2.self_attn.k_proj.weight]Loading weights:  10%|█          | 33/338 [00:00<00:00, 1445.19it/s, Materializing param=model.layers.2.self_attn.o_proj.weight]Loading weights:  10%|█          | 33/338 [00:00<00:00, 1432.97it/s, Materializing param=model.layers.2.self_attn.o_proj.weight]Loading weights:  10%|█▎           | 34/338 [00:00<00:00, 1446.75it/s, Materializing param=model.layers.2.self_attn.q_proj.bias]Loading weights:  10%|█▎           | 34/338 [00:00<00:00, 1434.80it/s, Materializing param=model.layers.2.self_attn.q_proj.bias]Loading weights:  10%|█▏         | 35/338 [00:00<00:00, 1456.82it/s, Materializing param=model.layers.2.self_attn.q_proj.weight]Loading weights:  10%|█▏         | 35/338 [00:00<00:00, 1444.69it/s, Materializing param=model.layers.2.self_attn.q_proj.weight]Loading weights:  11%|█▍           | 36/338 [00:00<00:00, 1450.49it/s, Materializing param=model.layers.2.self_attn.v_proj.bias]Loading weights:  11%|█▍           | 36/338 [00:00<00:00, 1439.43it/s, Materializing param=model.layers.2.self_attn.v_proj.bias]Loading weights:  11%|█▏         | 37/338 [00:00<00:00, 1425.62it/s, Materializing param=model.layers.2.self_attn.v_proj.weight]Loading weights:  11%|█▏         | 37/338 [00:00<00:00, 1414.55it/s, Materializing param=model.layers.2.self_attn.v_proj.weight]Loading weights:  11%|█▎          | 38/338 [00:00<00:00, 1410.05it/s, Materializing param=model.layers.3.input_layernorm.weight]Loading weights:  11%|█▎          | 38/338 [00:00<00:00, 1399.96it/s, Materializing param=model.layers.3.input_layernorm.weight]Loading weights:  12%|█▌            | 39/338 [00:00<00:00, 1420.62it/s, Materializing param=model.layers.3.mlp.down_proj.weight]Loading weights:  12%|█▌            | 39/338 [00:00<00:00, 1410.41it/s, Materializing param=model.layers.3.mlp.down_proj.weight]Loading weights:  12%|█▋            | 40/338 [00:00<00:00, 1414.16it/s, Materializing param=model.layers.3.mlp.gate_proj.weight]Loading weights:  12%|█▋            | 40/338 [00:00<00:00, 1401.99it/s, Materializing param=model.layers.3.mlp.gate_proj.weight]Loading weights:  12%|█▉              | 41/338 [00:00<00:00, 1389.19it/s, Materializing param=model.layers.3.mlp.up_proj.weight]Loading weights:  12%|█▉              | 41/338 [00:00<00:00, 1366.98it/s, Materializing param=model.layers.3.mlp.up_proj.weight]Loading weights:  12%|▎  | 42/338 [00:00<00:00, 1364.62it/s, Materializing param=model.layers.3.post_attention_layernorm.weight]Loading weights:  12%|▎  | 42/338 [00:00<00:00, 1331.02it/s, Materializing param=model.layers.3.post_attention_layernorm.weight]Loading weights:  13%|█▋           | 43/338 [00:00<00:00, 1332.79it/s, Materializing param=model.layers.3.self_attn.k_proj.bias]Loading weights:  13%|█▋           | 43/338 [00:00<00:00, 1312.60it/s, Materializing param=model.layers.3.self_attn.k_proj.bias]Loading weights:  13%|█▍         | 44/338 [00:00<00:00, 1270.73it/s, Materializing param=model.layers.3.self_attn.k_proj.weight]Loading weights:  13%|█▍         | 44/338 [00:00<00:00, 1257.05it/s, Materializing param=model.layers.3.self_attn.k_proj.weight]Loading weights:  13%|█▍         | 45/338 [00:00<00:00, 1264.94it/s, Materializing param=model.layers.3.self_attn.o_proj.weight]Loading weights:  13%|█▍         | 45/338 [00:00<00:00, 1246.89it/s, Materializing param=model.layers.3.self_attn.o_proj.weight]Loading weights:  14%|█▊           | 46/338 [00:00<00:00, 1254.13it/s, Materializing param=model.layers.3.self_attn.q_proj.bias]Loading weights:  14%|█▊           | 46/338 [00:00<00:00, 1237.75it/s, Materializing param=model.layers.3.self_attn.q_proj.bias]Loading weights:  14%|█▌         | 47/338 [00:00<00:00, 1217.38it/s, Materializing param=model.layers.3.self_attn.q_proj.weight]Loading weights:  14%|█▌         | 47/338 [00:00<00:00, 1205.67it/s, Materializing param=model.layers.3.self_attn.q_proj.weight]Loading weights:  14%|█▊           | 48/338 [00:00<00:00, 1214.08it/s, Materializing param=model.layers.3.self_attn.v_proj.bias]Loading weights:  14%|█▊           | 48/338 [00:00<00:00, 1192.99it/s, Materializing param=model.layers.3.self_attn.v_proj.bias]Loading weights:  14%|█▌         | 49/338 [00:00<00:00, 1191.18it/s, Materializing param=model.layers.3.self_attn.v_proj.weight]Loading weights:  14%|█▌         | 49/338 [00:00<00:00, 1173.59it/s, Materializing param=model.layers.3.self_attn.v_proj.weight]Loading weights:  15%|█▊          | 50/338 [00:00<00:00, 1181.25it/s, Materializing param=model.layers.4.input_layernorm.weight]Loading weights:  15%|█▊          | 50/338 [00:00<00:00, 1174.24it/s, Materializing param=model.layers.4.input_layernorm.weight]Loading weights:  15%|██            | 51/338 [00:00<00:00, 1186.53it/s, Materializing param=model.layers.4.mlp.down_proj.weight]Loading weights:  15%|██            | 51/338 [00:00<00:00, 1179.73it/s, Materializing param=model.layers.4.mlp.down_proj.weight]Loading weights:  15%|██▏           | 52/338 [00:00<00:00, 1187.17it/s, Materializing param=model.layers.4.mlp.gate_proj.weight]Loading weights:  15%|██▏           | 52/338 [00:00<00:00, 1149.13it/s, Materializing param=model.layers.4.mlp.gate_proj.weight]Loading weights:  16%|██▌             | 53/338 [00:00<00:00, 1160.55it/s, Materializing param=model.layers.4.mlp.up_proj.weight]Loading weights:  16%|██▌             | 53/338 [00:00<00:00, 1136.81it/s, Materializing param=model.layers.4.mlp.up_proj.weight]Loading weights:  16%|▍  | 54/338 [00:00<00:00, 1147.83it/s, Materializing param=model.layers.4.post_attention_layernorm.weight]Loading weights:  16%|▍  | 54/338 [00:00<00:00, 1140.38it/s, Materializing param=model.layers.4.post_attention_layernorm.weight]Loading weights:  16%|██           | 55/338 [00:00<00:00, 1145.18it/s, Materializing param=model.layers.4.self_attn.k_proj.bias]Loading weights:  16%|██           | 55/338 [00:00<00:00, 1139.50it/s, Materializing param=model.layers.4.self_attn.k_proj.bias]Loading weights:  17%|█▊         | 56/338 [00:00<00:00, 1150.62it/s, Materializing param=model.layers.4.self_attn.k_proj.weight]Loading weights:  17%|█▊         | 56/338 [00:00<00:00, 1144.93it/s, Materializing param=model.layers.4.self_attn.k_proj.weight]Loading weights:  17%|█▊         | 57/338 [00:00<00:00, 1140.68it/s, Materializing param=model.layers.4.self_attn.o_proj.weight]Loading weights:  17%|█▊         | 57/338 [00:00<00:00, 1134.94it/s, Materializing param=model.layers.4.self_attn.o_proj.weight]Loading weights:  17%|██▏          | 58/338 [00:00<00:00, 1145.49it/s, Materializing param=model.layers.4.self_attn.q_proj.bias]Loading weights:  17%|██▏          | 58/338 [00:00<00:00, 1131.71it/s, Materializing param=model.layers.4.self_attn.q_proj.bias]Loading weights:  17%|█▉         | 59/338 [00:00<00:00, 1136.73it/s, Materializing param=model.layers.4.self_attn.q_proj.weight]Loading weights:  17%|█▉         | 59/338 [00:00<00:00, 1126.34it/s, Materializing param=model.layers.4.self_attn.q_proj.weight]Loading weights:  18%|██▎          | 60/338 [00:00<00:00, 1136.46it/s, Materializing param=model.layers.4.self_attn.v_proj.bias]Loading weights:  18%|██▎          | 60/338 [00:00<00:00, 1128.00it/s, Materializing param=model.layers.4.self_attn.v_proj.bias]Loading weights:  18%|█▉         | 61/338 [00:00<00:00, 1135.68it/s, Materializing param=model.layers.4.self_attn.v_proj.weight]Loading weights:  18%|█▉         | 61/338 [00:00<00:00, 1127.26it/s, Materializing param=model.layers.4.self_attn.v_proj.weight]Loading weights:  18%|██▏         | 62/338 [00:00<00:00, 1118.59it/s, Materializing param=model.layers.5.input_layernorm.weight]Loading weights:  18%|██▏         | 62/338 [00:00<00:00, 1110.16it/s, Materializing param=model.layers.5.input_layernorm.weight]Loading weights:  19%|██▌           | 63/338 [00:00<00:00, 1120.11it/s, Materializing param=model.layers.5.mlp.down_proj.weight]Loading weights:  19%|██▌           | 63/338 [00:00<00:00, 1106.96it/s, Materializing param=model.layers.5.mlp.down_proj.weight]Loading weights:  19%|██▋           | 64/338 [00:00<00:00, 1113.54it/s, Materializing param=model.layers.5.mlp.gate_proj.weight]Loading weights:  19%|██▋           | 64/338 [00:00<00:00, 1108.74it/s, Materializing param=model.layers.5.mlp.gate_proj.weight]Loading weights:  19%|███             | 65/338 [00:00<00:00, 1107.95it/s, Materializing param=model.layers.5.mlp.up_proj.weight]Loading weights:  19%|███             | 65/338 [00:00<00:00, 1100.80it/s, Materializing param=model.layers.5.mlp.up_proj.weight]Loading weights:  20%|▌  | 66/338 [00:00<00:00, 1104.50it/s, Materializing param=model.layers.5.post_attention_layernorm.weight]Loading weights:  20%|▌  | 66/338 [00:00<00:00, 1093.06it/s, Materializing param=model.layers.5.post_attention_layernorm.weight]Loading weights:  20%|██▌          | 67/338 [00:00<00:00, 1102.23it/s, Materializing param=model.layers.5.self_attn.k_proj.bias]Loading weights:  20%|██▌          | 67/338 [00:00<00:00, 1093.84it/s, Materializing param=model.layers.5.self_attn.k_proj.bias]Loading weights:  20%|██▏        | 68/338 [00:00<00:00, 1088.86it/s, Materializing param=model.layers.5.self_attn.k_proj.weight]Loading weights:  20%|██▏        | 68/338 [00:00<00:00, 1072.65it/s, Materializing param=model.layers.5.self_attn.k_proj.weight]Loading weights:  20%|██▏        | 69/338 [00:00<00:00, 1072.72it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]Loading weights:  20%|██▏        | 69/338 [00:00<00:00, 1059.83it/s, Materializing param=model.layers.5.self_attn.o_proj.weight]Loading weights:  21%|██▋          | 70/338 [00:00<00:00, 1067.99it/s, Materializing param=model.layers.5.self_attn.q_proj.bias]Loading weights:  21%|██▋          | 70/338 [00:00<00:00, 1058.85it/s, Materializing param=model.layers.5.self_attn.q_proj.bias]Loading weights:  21%|██▎        | 71/338 [00:00<00:00, 1066.33it/s, Materializing param=model.layers.5.self_attn.q_proj.weight]Loading weights:  21%|██▎        | 71/338 [00:00<00:00, 1062.26it/s, Materializing param=model.layers.5.self_attn.q_proj.weight]Loading weights:  21%|██▊          | 72/338 [00:00<00:00, 1070.72it/s, Materializing param=model.layers.5.self_attn.v_proj.bias]Loading weights:  21%|██▊          | 72/338 [00:00<00:00, 1066.85it/s, Materializing param=model.layers.5.self_attn.v_proj.bias]Loading weights:  22%|██▍        | 73/338 [00:00<00:00, 1073.90it/s, Materializing param=model.layers.5.self_attn.v_proj.weight]Loading weights:  22%|██▍        | 73/338 [00:00<00:00, 1067.96it/s, Materializing param=model.layers.5.self_attn.v_proj.weight]Loading weights:  22%|██▋         | 74/338 [00:00<00:00, 1070.24it/s, Materializing param=model.layers.6.input_layernorm.weight]Loading weights:  22%|██▋         | 74/338 [00:00<00:00, 1066.29it/s, Materializing param=model.layers.6.input_layernorm.weight]Loading weights:  22%|███           | 75/338 [00:00<00:00, 1049.86it/s, Materializing param=model.layers.6.mlp.down_proj.weight]Loading weights:  22%|███           | 75/338 [00:00<00:00, 1039.70it/s, Materializing param=model.layers.6.mlp.down_proj.weight]Loading weights:  22%|███▏          | 76/338 [00:00<00:00, 1047.53it/s, Materializing param=model.layers.6.mlp.gate_proj.weight]Loading weights:  22%|███▏          | 76/338 [00:00<00:00, 1039.61it/s, Materializing param=model.layers.6.mlp.gate_proj.weight]Loading weights:  23%|███▋            | 77/338 [00:00<00:00, 1044.59it/s, Materializing param=model.layers.6.mlp.up_proj.weight]Loading weights:  23%|███▋            | 77/338 [00:00<00:00, 1039.32it/s, Materializing param=model.layers.6.mlp.up_proj.weight]Loading weights:  23%|▋  | 78/338 [00:00<00:00, 1033.83it/s, Materializing param=model.layers.6.post_attention_layernorm.weight]Loading weights:  23%|▋  | 78/338 [00:00<00:00, 1018.36it/s, Materializing param=model.layers.6.post_attention_layernorm.weight]Loading weights:  23%|███          | 79/338 [00:00<00:00, 1025.65it/s, Materializing param=model.layers.6.self_attn.k_proj.bias]Loading weights:  23%|███          | 79/338 [00:00<00:00, 1022.27it/s, Materializing param=model.layers.6.self_attn.k_proj.bias]Loading weights:  24%|██▌        | 80/338 [00:00<00:00, 1027.77it/s, Materializing param=model.layers.6.self_attn.k_proj.weight]Loading weights:  24%|██▌        | 80/338 [00:00<00:00, 1024.23it/s, Materializing param=model.layers.6.self_attn.k_proj.weight]Loading weights:  24%|██▋        | 81/338 [00:00<00:00, 1031.43it/s, Materializing param=model.layers.6.self_attn.o_proj.weight]Loading weights:  24%|██▋        | 81/338 [00:00<00:00, 1028.19it/s, Materializing param=model.layers.6.self_attn.o_proj.weight]Loading weights:  24%|███▏         | 82/338 [00:00<00:00, 1034.48it/s, Materializing param=model.layers.6.self_attn.q_proj.bias]Loading weights:  24%|███▏         | 82/338 [00:00<00:00, 1019.48it/s, Materializing param=model.layers.6.self_attn.q_proj.bias]Loading weights:  25%|██▋        | 83/338 [00:00<00:00, 1025.89it/s, Materializing param=model.layers.6.self_attn.q_proj.weight]Loading weights:  25%|██▋        | 83/338 [00:00<00:00, 1013.65it/s, Materializing param=model.layers.6.self_attn.q_proj.weight]Loading weights:  25%|███▏         | 84/338 [00:00<00:00, 1020.44it/s, Materializing param=model.layers.6.self_attn.v_proj.bias]Loading weights:  25%|███▏         | 84/338 [00:00<00:00, 1012.97it/s, Materializing param=model.layers.6.self_attn.v_proj.bias]Loading weights:  25%|██▊        | 85/338 [00:00<00:00, 1014.92it/s, Materializing param=model.layers.6.self_attn.v_proj.weight]Loading weights:  25%|██▊        | 85/338 [00:00<00:00, 1007.27it/s, Materializing param=model.layers.6.self_attn.v_proj.weight]Loading weights:  25%|███         | 86/338 [00:00<00:00, 1014.22it/s, Materializing param=model.layers.7.input_layernorm.weight]Loading weights:  25%|███         | 86/338 [00:00<00:00, 1011.30it/s, Materializing param=model.layers.7.input_layernorm.weight]Loading weights:  26%|███▌          | 87/338 [00:00<00:00, 1016.43it/s, Materializing param=model.layers.7.mlp.down_proj.weight]Loading weights:  26%|███▌          | 87/338 [00:00<00:00, 1009.81it/s, Materializing param=model.layers.7.mlp.down_proj.weight]Loading weights:  26%|███▋          | 88/338 [00:00<00:00, 1014.94it/s, Materializing param=model.layers.7.mlp.gate_proj.weight]Loading weights:  26%|███▋          | 88/338 [00:00<00:00, 1010.44it/s, Materializing param=model.layers.7.mlp.gate_proj.weight]Loading weights:  26%|████▏           | 89/338 [00:00<00:00, 1012.47it/s, Materializing param=model.layers.7.mlp.up_proj.weight]Loading weights:  26%|████▏           | 89/338 [00:00<00:00, 1005.50it/s, Materializing param=model.layers.7.mlp.up_proj.weight]Loading weights:  27%|▊  | 90/338 [00:00<00:00, 1004.19it/s, Materializing param=model.layers.7.post_attention_layernorm.weight]Loading weights:  27%|█   | 90/338 [00:00<00:00, 996.81it/s, Materializing param=model.layers.7.post_attention_layernorm.weight]Loading weights:  27%|███▌         | 91/338 [00:00<00:00, 1002.37it/s, Materializing param=model.layers.7.self_attn.k_proj.bias]Loading weights:  27%|███▊          | 91/338 [00:00<00:00, 999.71it/s, Materializing param=model.layers.7.self_attn.k_proj.bias]Loading weights:  27%|██▉        | 92/338 [00:00<00:00, 1006.06it/s, Materializing param=model.layers.7.self_attn.k_proj.weight]Loading weights:  27%|██▉        | 92/338 [00:00<00:00, 1003.34it/s, Materializing param=model.layers.7.self_attn.k_proj.weight]Loading weights:  28%|███        | 93/338 [00:00<00:00, 1007.90it/s, Materializing param=model.layers.7.self_attn.o_proj.weight]Loading weights:  28%|███        | 93/338 [00:00<00:00, 1002.45it/s, Materializing param=model.layers.7.self_attn.o_proj.weight]Loading weights:  28%|███▌         | 94/338 [00:00<00:00, 1007.29it/s, Materializing param=model.layers.7.self_attn.q_proj.bias]Loading weights:  28%|███▉          | 94/338 [00:00<00:00, 997.25it/s, Materializing param=model.layers.7.self_attn.q_proj.bias]Loading weights:  28%|███▎        | 95/338 [00:00<00:00, 999.85it/s, Materializing param=model.layers.7.self_attn.q_proj.weight]Loading weights:  28%|███▎        | 95/338 [00:00<00:00, 996.00it/s, Materializing param=model.layers.7.self_attn.q_proj.weight]Loading weights:  28%|███▋         | 96/338 [00:00<00:00, 1000.94it/s, Materializing param=model.layers.7.self_attn.v_proj.bias]Loading weights:  28%|███▉          | 96/338 [00:00<00:00, 998.35it/s, Materializing param=model.layers.7.self_attn.v_proj.bias]Loading weights:  29%|███▏       | 97/338 [00:00<00:00, 1004.80it/s, Materializing param=model.layers.7.self_attn.v_proj.weight]Loading weights:  29%|███▏       | 97/338 [00:00<00:00, 1000.48it/s, Materializing param=model.layers.7.self_attn.v_proj.weight]Loading weights:  29%|███▊         | 98/338 [00:00<00:00, 998.25it/s, Materializing param=model.layers.8.input_layernorm.weight]Loading weights:  29%|███▊         | 98/338 [00:00<00:00, 995.66it/s, Materializing param=model.layers.8.input_layernorm.weight]Loading weights:  29%|████▍          | 99/338 [00:00<00:00, 997.70it/s, Materializing param=model.layers.8.mlp.down_proj.weight]Loading weights:  29%|████▍          | 99/338 [00:00<00:00, 988.10it/s, Materializing param=model.layers.8.mlp.down_proj.weight]Loading weights:  30%|████▏         | 100/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.mlp.down_proj.weight]Loading weights:  30%|████▏         | 100/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.mlp.gate_proj.weight]Loading weights:  30%|████▏         | 100/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.mlp.gate_proj.weight]Loading weights:  30%|████▊           | 101/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.mlp.up_proj.weight]Loading weights:  30%|████▊           | 101/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.mlp.up_proj.weight]Loading weights:  30%|▉  | 102/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.post_attention_layernorm.weight]Loading weights:  30%|▉  | 102/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.post_attention_layernorm.weight]Loading weights:  30%|███▉         | 103/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.k_proj.bias]Loading weights:  30%|███▉         | 103/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.k_proj.bias]Loading weights:  31%|███▍       | 104/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.k_proj.weight]Loading weights:  31%|███▍       | 104/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.k_proj.weight]Loading weights:  31%|███▍       | 105/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.o_proj.weight]Loading weights:  31%|███▍       | 105/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.o_proj.weight]Loading weights:  31%|████         | 106/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.q_proj.bias]Loading weights:  31%|████         | 106/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.q_proj.bias]Loading weights:  32%|███▍       | 107/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.q_proj.weight]Loading weights:  32%|███▍       | 107/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.q_proj.weight]Loading weights:  32%|████▏        | 108/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.v_proj.bias]Loading weights:  32%|████▏        | 108/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.v_proj.bias]Loading weights:  32%|███▌       | 109/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.v_proj.weight]Loading weights:  32%|███▌       | 109/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.8.self_attn.v_proj.weight]Loading weights:  33%|███▉        | 110/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.input_layernorm.weight]Loading weights:  33%|███▉        | 110/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.input_layernorm.weight]Loading weights:  33%|████▌         | 111/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.mlp.down_proj.weight]Loading weights:  33%|████▌         | 111/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.mlp.down_proj.weight]Loading weights:  33%|████▋         | 112/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.mlp.gate_proj.weight]Loading weights:  33%|████▋         | 112/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.mlp.gate_proj.weight]Loading weights:  33%|█████▎          | 113/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.mlp.up_proj.weight]Loading weights:  33%|█████▎          | 113/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.mlp.up_proj.weight]Loading weights:  34%|█  | 114/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.post_attention_layernorm.weight]Loading weights:  34%|█  | 114/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.post_attention_layernorm.weight]Loading weights:  34%|████▍        | 115/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.k_proj.bias]Loading weights:  34%|████▍        | 115/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.k_proj.bias]Loading weights:  34%|███▊       | 116/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.k_proj.weight]Loading weights:  34%|███▊       | 116/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.k_proj.weight]Loading weights:  35%|███▊       | 117/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.o_proj.weight]Loading weights:  35%|███▊       | 117/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.o_proj.weight]Loading weights:  35%|████▌        | 118/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.q_proj.bias]Loading weights:  35%|████▌        | 118/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.q_proj.bias]Loading weights:  35%|███▊       | 119/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.q_proj.weight]Loading weights:  35%|███▊       | 119/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.q_proj.weight]Loading weights:  36%|████▌        | 120/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.v_proj.bias]Loading weights:  36%|████▌        | 120/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.v_proj.bias]Loading weights:  36%|███▉       | 121/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.v_proj.weight]Loading weights:  36%|███▉       | 121/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.9.self_attn.v_proj.weight]Loading weights:  36%|███▉       | 122/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.input_layernorm.weight]Loading weights:  36%|███▉       | 122/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.input_layernorm.weight]Loading weights:  36%|████▋        | 123/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.mlp.down_proj.weight]Loading weights:  36%|████▋        | 123/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.mlp.down_proj.weight]Loading weights:  37%|████▊        | 124/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.mlp.gate_proj.weight]Loading weights:  37%|████▊        | 124/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.mlp.gate_proj.weight]Loading weights:  37%|█████▌         | 125/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.mlp.up_proj.weight]Loading weights:  37%|█████▌         | 125/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.mlp.up_proj.weight]Loading weights:  37%|▋ | 126/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.post_attention_layernorm.weight]Loading weights:  37%|▋ | 126/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.post_attention_layernorm.weight]Loading weights:  38%|████▌       | 127/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.k_proj.bias]Loading weights:  38%|████▌       | 127/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.k_proj.bias]Loading weights:  38%|███▊      | 128/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.k_proj.weight]Loading weights:  38%|███▊      | 128/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.k_proj.weight]Loading weights:  38%|███▊      | 129/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.o_proj.weight]Loading weights:  38%|███▊      | 129/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.o_proj.weight]Loading weights:  38%|████▌       | 130/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.q_proj.bias]Loading weights:  38%|████▌       | 130/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.q_proj.bias]Loading weights:  39%|███▉      | 131/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.q_proj.weight]Loading weights:  39%|███▉      | 131/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.q_proj.weight]Loading weights:  39%|████▋       | 132/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.v_proj.bias]Loading weights:  39%|████▋       | 132/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.v_proj.bias]Loading weights:  39%|███▉      | 133/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.v_proj.weight]Loading weights:  39%|███▉      | 133/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.10.self_attn.v_proj.weight]Loading weights:  40%|████▎      | 134/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.input_layernorm.weight]Loading weights:  40%|████▎      | 134/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.input_layernorm.weight]Loading weights:  40%|█████▏       | 135/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.mlp.down_proj.weight]Loading weights:  40%|█████▏       | 135/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.mlp.down_proj.weight]Loading weights:  40%|█████▏       | 136/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.mlp.gate_proj.weight]Loading weights:  40%|█████▏       | 136/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.mlp.gate_proj.weight]Loading weights:  41%|██████         | 137/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.mlp.up_proj.weight]Loading weights:  41%|██████         | 137/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.mlp.up_proj.weight]Loading weights:  41%|▊ | 138/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.post_attention_layernorm.weight]Loading weights:  41%|▊ | 138/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.post_attention_layernorm.weight]Loading weights:  41%|████▉       | 139/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.k_proj.bias]Loading weights:  41%|████▉       | 139/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.k_proj.bias]Loading weights:  41%|████▏     | 140/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.k_proj.weight]Loading weights:  41%|████▏     | 140/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.k_proj.weight]Loading weights:  42%|████▏     | 141/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.o_proj.weight]Loading weights:  42%|████▏     | 141/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.o_proj.weight]Loading weights:  42%|█████       | 142/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.q_proj.bias]Loading weights:  42%|█████       | 142/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.q_proj.bias]Loading weights:  42%|████▏     | 143/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.q_proj.weight]Loading weights:  42%|████▏     | 143/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.q_proj.weight]Loading weights:  43%|█████       | 144/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.v_proj.bias]Loading weights:  43%|█████       | 144/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.v_proj.bias]Loading weights:  43%|████▎     | 145/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.v_proj.weight]Loading weights:  43%|████▎     | 145/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.11.self_attn.v_proj.weight]Loading weights:  43%|████▊      | 146/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.input_layernorm.weight]Loading weights:  43%|████▊      | 146/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.input_layernorm.weight]Loading weights:  43%|█████▋       | 147/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.mlp.down_proj.weight]Loading weights:  43%|█████▋       | 147/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.mlp.down_proj.weight]Loading weights:  44%|█████▋       | 148/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.mlp.gate_proj.weight]Loading weights:  44%|█████▋       | 148/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.mlp.gate_proj.weight]Loading weights:  44%|██████▌        | 149/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.mlp.up_proj.weight]Loading weights:  44%|██████▌        | 149/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.mlp.up_proj.weight]Loading weights:  44%|▉ | 150/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.post_attention_layernorm.weight]Loading weights:  44%|▉ | 150/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.post_attention_layernorm.weight]Loading weights:  45%|█████▎      | 151/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.k_proj.bias]Loading weights:  45%|█████▎      | 151/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.k_proj.bias]Loading weights:  45%|████▍     | 152/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.k_proj.weight]Loading weights:  45%|████▍     | 152/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.k_proj.weight]Loading weights:  45%|████▌     | 153/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.o_proj.weight]Loading weights:  45%|████▌     | 153/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.o_proj.weight]Loading weights:  46%|█████▍      | 154/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.q_proj.bias]Loading weights:  46%|█████▍      | 154/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.q_proj.bias]Loading weights:  46%|████▌     | 155/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.q_proj.weight]Loading weights:  46%|████▌     | 155/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.q_proj.weight]Loading weights:  46%|█████▌      | 156/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.v_proj.bias]Loading weights:  46%|█████▌      | 156/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.v_proj.bias]Loading weights:  46%|████▋     | 157/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.v_proj.weight]Loading weights:  46%|████▋     | 157/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.12.self_attn.v_proj.weight]Loading weights:  47%|█████▏     | 158/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.input_layernorm.weight]Loading weights:  47%|█████▏     | 158/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.input_layernorm.weight]Loading weights:  47%|██████       | 159/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.mlp.down_proj.weight]Loading weights:  47%|██████       | 159/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.mlp.down_proj.weight]Loading weights:  47%|██████▏      | 160/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.mlp.gate_proj.weight]Loading weights:  47%|██████▏      | 160/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.mlp.gate_proj.weight]Loading weights:  48%|███████▏       | 161/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.mlp.up_proj.weight]Loading weights:  48%|███████▏       | 161/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.mlp.up_proj.weight]Loading weights:  48%|▉ | 162/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.post_attention_layernorm.weight]Loading weights:  48%|▉ | 162/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.post_attention_layernorm.weight]Loading weights:  48%|█████▊      | 163/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.k_proj.bias]Loading weights:  48%|█████▊      | 163/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.k_proj.bias]Loading weights:  49%|████▊     | 164/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.k_proj.weight]Loading weights:  49%|████▊     | 164/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.k_proj.weight]Loading weights:  49%|████▉     | 165/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.o_proj.weight]Loading weights:  49%|████▉     | 165/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.o_proj.weight]Loading weights:  49%|█████▉      | 166/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.q_proj.bias]Loading weights:  49%|█████▉      | 166/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.q_proj.bias]Loading weights:  49%|████▉     | 167/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.q_proj.weight]Loading weights:  49%|████▉     | 167/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.q_proj.weight]Loading weights:  50%|█████▉      | 168/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.v_proj.bias]Loading weights:  50%|█████▉      | 168/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.v_proj.bias]Loading weights:  50%|█████     | 169/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.v_proj.weight]Loading weights:  50%|█████     | 169/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.13.self_attn.v_proj.weight]Loading weights:  50%|█████▌     | 170/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.input_layernorm.weight]Loading weights:  50%|█████▌     | 170/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.input_layernorm.weight]Loading weights:  51%|██████▌      | 171/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.mlp.down_proj.weight]Loading weights:  51%|██████▌      | 171/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.mlp.down_proj.weight]Loading weights:  51%|██████▌      | 172/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.mlp.gate_proj.weight]Loading weights:  51%|██████▌      | 172/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.mlp.gate_proj.weight]Loading weights:  51%|███████▋       | 173/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.mlp.up_proj.weight]Loading weights:  51%|███████▋       | 173/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.mlp.up_proj.weight]Loading weights:  51%|█ | 174/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.post_attention_layernorm.weight]Loading weights:  51%|█ | 174/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.post_attention_layernorm.weight]Loading weights:  52%|██████▏     | 175/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.k_proj.bias]Loading weights:  52%|██████▏     | 175/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.k_proj.bias]Loading weights:  52%|█████▏    | 176/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.k_proj.weight]Loading weights:  52%|█████▏    | 176/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.k_proj.weight]Loading weights:  52%|█████▏    | 177/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.o_proj.weight]Loading weights:  52%|█████▏    | 177/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.o_proj.weight]Loading weights:  53%|██████▎     | 178/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.q_proj.bias]Loading weights:  53%|██████▎     | 178/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.q_proj.bias]Loading weights:  53%|█████▎    | 179/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.q_proj.weight]Loading weights:  53%|█████▎    | 179/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.q_proj.weight]Loading weights:  53%|██████▍     | 180/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.v_proj.bias]Loading weights:  53%|██████▍     | 180/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.v_proj.bias]Loading weights:  54%|█████▎    | 181/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.v_proj.weight]Loading weights:  54%|█████▎    | 181/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.14.self_attn.v_proj.weight]Loading weights:  54%|█████▉     | 182/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.input_layernorm.weight]Loading weights:  54%|█████▉     | 182/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.input_layernorm.weight]Loading weights:  54%|███████      | 183/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.mlp.down_proj.weight]Loading weights:  54%|███████      | 183/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.mlp.down_proj.weight]Loading weights:  54%|███████      | 184/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.mlp.gate_proj.weight]Loading weights:  54%|███████      | 184/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.mlp.gate_proj.weight]Loading weights:  55%|████████▏      | 185/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.mlp.up_proj.weight]Loading weights:  55%|████████▏      | 185/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.mlp.up_proj.weight]Loading weights:  55%|█ | 186/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.post_attention_layernorm.weight]Loading weights:  55%|█ | 186/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.post_attention_layernorm.weight]Loading weights:  55%|██████▋     | 187/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.k_proj.bias]Loading weights:  55%|██████▋     | 187/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.k_proj.bias]Loading weights:  56%|█████▌    | 188/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.k_proj.weight]Loading weights:  56%|█████▌    | 188/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.k_proj.weight]Loading weights:  56%|█████▌    | 189/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.o_proj.weight]Loading weights:  56%|█████▌    | 189/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.o_proj.weight]Loading weights:  56%|██████▋     | 190/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.q_proj.bias]Loading weights:  56%|██████▋     | 190/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.q_proj.bias]Loading weights:  57%|█████▋    | 191/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.q_proj.weight]Loading weights:  57%|█████▋    | 191/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.q_proj.weight]Loading weights:  57%|██████▊     | 192/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.v_proj.bias]Loading weights:  57%|██████▊     | 192/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.v_proj.bias]Loading weights:  57%|█████▋    | 193/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.v_proj.weight]Loading weights:  57%|█████▋    | 193/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.15.self_attn.v_proj.weight]Loading weights:  57%|██████▎    | 194/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.input_layernorm.weight]Loading weights:  57%|██████▎    | 194/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.input_layernorm.weight]Loading weights:  58%|███████▍     | 195/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.mlp.down_proj.weight]Loading weights:  58%|███████▍     | 195/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.mlp.down_proj.weight]Loading weights:  58%|███████▌     | 196/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.mlp.gate_proj.weight]Loading weights:  58%|███████▌     | 196/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.mlp.gate_proj.weight]Loading weights:  58%|████████▋      | 197/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.mlp.up_proj.weight]Loading weights:  58%|████████▋      | 197/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.mlp.up_proj.weight]Loading weights:  59%|█▏| 198/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.post_attention_layernorm.weight]Loading weights:  59%|█▏| 198/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.post_attention_layernorm.weight]Loading weights:  59%|███████     | 199/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.k_proj.bias]Loading weights:  59%|███████     | 199/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.k_proj.bias]Loading weights:  59%|█████▉    | 200/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.k_proj.weight]Loading weights:  59%|█████▉    | 200/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.k_proj.weight]Loading weights:  59%|█████▉    | 201/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.o_proj.weight]Loading weights:  59%|█████▉    | 201/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.o_proj.weight]Loading weights:  60%|███████▏    | 202/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.q_proj.bias]Loading weights:  60%|███████▏    | 202/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.q_proj.bias]Loading weights:  60%|██████    | 203/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.q_proj.weight]Loading weights:  60%|██████    | 203/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.q_proj.weight]Loading weights:  60%|███████▏    | 204/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.v_proj.bias]Loading weights:  60%|███████▏    | 204/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.v_proj.bias]Loading weights:  61%|██████    | 205/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.v_proj.weight]Loading weights:  61%|██████    | 205/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.16.self_attn.v_proj.weight]Loading weights:  61%|██████▋    | 206/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.input_layernorm.weight]Loading weights:  61%|██████▋    | 206/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.input_layernorm.weight]Loading weights:  61%|███████▉     | 207/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.mlp.down_proj.weight]Loading weights:  61%|███████▉     | 207/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.mlp.down_proj.weight]Loading weights:  62%|████████     | 208/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.mlp.gate_proj.weight]Loading weights:  62%|████████     | 208/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.mlp.gate_proj.weight]Loading weights:  62%|█████████▎     | 209/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.mlp.up_proj.weight]Loading weights:  62%|█████████▎     | 209/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.mlp.up_proj.weight]Loading weights:  62%|█▏| 210/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.post_attention_layernorm.weight]Loading weights:  62%|█▏| 210/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.post_attention_layernorm.weight]Loading weights:  62%|███████▍    | 211/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.k_proj.bias]Loading weights:  62%|███████▍    | 211/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.k_proj.bias]Loading weights:  63%|██████▎   | 212/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.k_proj.weight]Loading weights:  63%|██████▎   | 212/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.k_proj.weight]Loading weights:  63%|██████▎   | 213/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.o_proj.weight]Loading weights:  63%|██████▎   | 213/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.o_proj.weight]Loading weights:  63%|███████▌    | 214/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.q_proj.bias]Loading weights:  63%|███████▌    | 214/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.q_proj.bias]Loading weights:  64%|██████▎   | 215/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.q_proj.weight]Loading weights:  64%|██████▎   | 215/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.q_proj.weight]Loading weights:  64%|███████▋    | 216/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.v_proj.bias]Loading weights:  64%|███████▋    | 216/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.v_proj.bias]Loading weights:  64%|██████▍   | 217/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.v_proj.weight]Loading weights:  64%|██████▍   | 217/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.17.self_attn.v_proj.weight]Loading weights:  64%|███████    | 218/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.input_layernorm.weight]Loading weights:  64%|███████    | 218/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.input_layernorm.weight]Loading weights:  65%|████████▍    | 219/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.mlp.down_proj.weight]Loading weights:  65%|████████▍    | 219/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.mlp.down_proj.weight]Loading weights:  65%|████████▍    | 220/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.mlp.gate_proj.weight]Loading weights:  65%|████████▍    | 220/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.mlp.gate_proj.weight]Loading weights:  65%|█████████▊     | 221/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.mlp.up_proj.weight]Loading weights:  65%|█████████▊     | 221/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.mlp.up_proj.weight]Loading weights:  66%|█▎| 222/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.post_attention_layernorm.weight]Loading weights:  66%|█▎| 222/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.post_attention_layernorm.weight]Loading weights:  66%|███████▉    | 223/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.k_proj.bias]Loading weights:  66%|███████▉    | 223/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.k_proj.bias]Loading weights:  66%|██████▋   | 224/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.k_proj.weight]Loading weights:  66%|██████▋   | 224/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.k_proj.weight]Loading weights:  67%|██████▋   | 225/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.o_proj.weight]Loading weights:  67%|██████▋   | 225/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.o_proj.weight]Loading weights:  67%|████████    | 226/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.q_proj.bias]Loading weights:  67%|████████    | 226/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.q_proj.bias]Loading weights:  67%|██████▋   | 227/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.q_proj.weight]Loading weights:  67%|██████▋   | 227/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.q_proj.weight]Loading weights:  67%|████████    | 228/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.v_proj.bias]Loading weights:  67%|████████    | 228/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.v_proj.bias]Loading weights:  68%|██████▊   | 229/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.v_proj.weight]Loading weights:  68%|██████▊   | 229/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.18.self_attn.v_proj.weight]Loading weights:  68%|███████▍   | 230/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.input_layernorm.weight]Loading weights:  68%|███████▍   | 230/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.input_layernorm.weight]Loading weights:  68%|████████▉    | 231/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.mlp.down_proj.weight]Loading weights:  68%|████████▉    | 231/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.mlp.down_proj.weight]Loading weights:  69%|████████▉    | 232/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.mlp.gate_proj.weight]Loading weights:  69%|████████▉    | 232/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.mlp.gate_proj.weight]Loading weights:  69%|██████████▎    | 233/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.mlp.up_proj.weight]Loading weights:  69%|██████████▎    | 233/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.mlp.up_proj.weight]Loading weights:  69%|█▍| 234/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.post_attention_layernorm.weight]Loading weights:  69%|█▍| 234/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.post_attention_layernorm.weight]Loading weights:  70%|████████▎   | 235/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.k_proj.bias]Loading weights:  70%|████████▎   | 235/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.k_proj.bias]Loading weights:  70%|██████▉   | 236/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.k_proj.weight]Loading weights:  70%|██████▉   | 236/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.k_proj.weight]Loading weights:  70%|███████   | 237/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.o_proj.weight]Loading weights:  70%|███████   | 237/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.o_proj.weight]Loading weights:  70%|████████▍   | 238/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.q_proj.bias]Loading weights:  70%|████████▍   | 238/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.q_proj.bias]Loading weights:  71%|███████   | 239/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.q_proj.weight]Loading weights:  71%|███████   | 239/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.q_proj.weight]Loading weights:  71%|████████▌   | 240/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.v_proj.bias]Loading weights:  71%|████████▌   | 240/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.v_proj.bias]Loading weights:  71%|███████▏  | 241/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.v_proj.weight]Loading weights:  71%|███████▏  | 241/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.19.self_attn.v_proj.weight]Loading weights:  72%|███████▉   | 242/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.input_layernorm.weight]Loading weights:  72%|███████▉   | 242/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.input_layernorm.weight]Loading weights:  72%|█████████▎   | 243/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.mlp.down_proj.weight]Loading weights:  72%|█████████▎   | 243/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.mlp.down_proj.weight]Loading weights:  72%|█████████▍   | 244/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.mlp.gate_proj.weight]Loading weights:  72%|█████████▍   | 244/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.mlp.gate_proj.weight]Loading weights:  72%|██████████▊    | 245/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.mlp.up_proj.weight]Loading weights:  72%|██████████▊    | 245/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.mlp.up_proj.weight]Loading weights:  73%|█▍| 246/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.post_attention_layernorm.weight]Loading weights:  73%|█▍| 246/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.post_attention_layernorm.weight]Loading weights:  73%|████████▊   | 247/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.k_proj.bias]Loading weights:  73%|████████▊   | 247/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.k_proj.bias]Loading weights:  73%|███████▎  | 248/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.k_proj.weight]Loading weights:  73%|███████▎  | 248/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.k_proj.weight]Loading weights:  74%|███████▎  | 249/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.o_proj.weight]Loading weights:  74%|███████▎  | 249/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.o_proj.weight]Loading weights:  74%|████████▉   | 250/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.q_proj.bias]Loading weights:  74%|████████▉   | 250/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.q_proj.bias]Loading weights:  74%|███████▍  | 251/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.q_proj.weight]Loading weights:  74%|███████▍  | 251/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.q_proj.weight]Loading weights:  75%|████████▉   | 252/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.v_proj.bias]Loading weights:  75%|████████▉   | 252/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.v_proj.bias]Loading weights:  75%|███████▍  | 253/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.v_proj.weight]Loading weights:  75%|███████▍  | 253/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.20.self_attn.v_proj.weight]Loading weights:  75%|████████▎  | 254/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.input_layernorm.weight]Loading weights:  75%|████████▎  | 254/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.input_layernorm.weight]Loading weights:  75%|█████████▊   | 255/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.mlp.down_proj.weight]Loading weights:  75%|█████████▊   | 255/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.mlp.down_proj.weight]Loading weights:  76%|█████████▊   | 256/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.mlp.gate_proj.weight]Loading weights:  76%|█████████▊   | 256/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.mlp.gate_proj.weight]Loading weights:  76%|███████████▍   | 257/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.mlp.up_proj.weight]Loading weights:  76%|███████████▍   | 257/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.mlp.up_proj.weight]Loading weights:  76%|█▌| 258/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.post_attention_layernorm.weight]Loading weights:  76%|█▌| 258/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.post_attention_layernorm.weight]Loading weights:  77%|█████████▏  | 259/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.k_proj.bias]Loading weights:  77%|█████████▏  | 259/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.k_proj.bias]Loading weights:  77%|███████▋  | 260/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.k_proj.weight]Loading weights:  77%|███████▋  | 260/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.k_proj.weight]Loading weights:  77%|███████▋  | 261/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.o_proj.weight]Loading weights:  77%|███████▋  | 261/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.o_proj.weight]Loading weights:  78%|█████████▎  | 262/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.q_proj.bias]Loading weights:  78%|█████████▎  | 262/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.q_proj.bias]Loading weights:  78%|███████▊  | 263/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.q_proj.weight]Loading weights:  78%|███████▊  | 263/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.q_proj.weight]Loading weights:  78%|█████████▎  | 264/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.v_proj.bias]Loading weights:  78%|█████████▎  | 264/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.v_proj.bias]Loading weights:  78%|███████▊  | 265/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.v_proj.weight]Loading weights:  78%|███████▊  | 265/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.21.self_attn.v_proj.weight]Loading weights:  79%|████████▋  | 266/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.input_layernorm.weight]Loading weights:  79%|████████▋  | 266/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.input_layernorm.weight]Loading weights:  79%|██████████▎  | 267/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.mlp.down_proj.weight]Loading weights:  79%|██████████▎  | 267/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.mlp.down_proj.weight]Loading weights:  79%|██████████▎  | 268/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.mlp.gate_proj.weight]Loading weights:  79%|██████████▎  | 268/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.mlp.gate_proj.weight]Loading weights:  80%|███████████▉   | 269/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.mlp.up_proj.weight]Loading weights:  80%|███████████▉   | 269/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.mlp.up_proj.weight]Loading weights:  80%|█▌| 270/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.post_attention_layernorm.weight]Loading weights:  80%|█▌| 270/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.post_attention_layernorm.weight]Loading weights:  80%|█████████▌  | 271/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.k_proj.bias]Loading weights:  80%|█████████▌  | 271/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.k_proj.bias]Loading weights:  80%|████████  | 272/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.k_proj.weight]Loading weights:  80%|████████  | 272/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.k_proj.weight]Loading weights:  81%|████████  | 273/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.o_proj.weight]Loading weights:  81%|████████  | 273/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.o_proj.weight]Loading weights:  81%|█████████▋  | 274/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.q_proj.bias]Loading weights:  81%|█████████▋  | 274/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.q_proj.bias]Loading weights:  81%|████████▏ | 275/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.q_proj.weight]Loading weights:  81%|████████▏ | 275/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.q_proj.weight]Loading weights:  82%|█████████▊  | 276/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.v_proj.bias]Loading weights:  82%|█████████▊  | 276/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.v_proj.bias]Loading weights:  82%|████████▏ | 277/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.v_proj.weight]Loading weights:  82%|████████▏ | 277/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.22.self_attn.v_proj.weight]Loading weights:  82%|█████████  | 278/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.input_layernorm.weight]Loading weights:  82%|█████████  | 278/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.input_layernorm.weight]Loading weights:  83%|██████████▋  | 279/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.mlp.down_proj.weight]Loading weights:  83%|██████████▋  | 279/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.mlp.down_proj.weight]Loading weights:  83%|██████████▊  | 280/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.mlp.gate_proj.weight]Loading weights:  83%|██████████▊  | 280/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.mlp.gate_proj.weight]Loading weights:  83%|████████████▍  | 281/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.mlp.up_proj.weight]Loading weights:  83%|████████████▍  | 281/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.mlp.up_proj.weight]Loading weights:  83%|█▋| 282/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.post_attention_layernorm.weight]Loading weights:  83%|█▋| 282/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.post_attention_layernorm.weight]Loading weights:  84%|██████████  | 283/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.k_proj.bias]Loading weights:  84%|██████████  | 283/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.k_proj.bias]Loading weights:  84%|████████▍ | 284/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.k_proj.weight]Loading weights:  84%|████████▍ | 284/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.k_proj.weight]Loading weights:  84%|████████▍ | 285/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.o_proj.weight]Loading weights:  84%|████████▍ | 285/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.o_proj.weight]Loading weights:  85%|██████████▏ | 286/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.q_proj.bias]Loading weights:  85%|██████████▏ | 286/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.q_proj.bias]Loading weights:  85%|████████▍ | 287/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.q_proj.weight]Loading weights:  85%|████████▍ | 287/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.q_proj.weight]Loading weights:  85%|██████████▏ | 288/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.v_proj.bias]Loading weights:  85%|██████████▏ | 288/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.v_proj.bias]Loading weights:  86%|████████▌ | 289/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.v_proj.weight]Loading weights:  86%|████████▌ | 289/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.23.self_attn.v_proj.weight]Loading weights:  86%|█████████▍ | 290/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.input_layernorm.weight]Loading weights:  86%|█████████▍ | 290/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.input_layernorm.weight]Loading weights:  86%|███████████▏ | 291/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.mlp.down_proj.weight]Loading weights:  86%|███████████▏ | 291/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.mlp.down_proj.weight]Loading weights:  86%|███████████▏ | 292/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.mlp.gate_proj.weight]Loading weights:  86%|███████████▏ | 292/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.mlp.gate_proj.weight]Loading weights:  87%|█████████████  | 293/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.mlp.up_proj.weight]Loading weights:  87%|█████████████  | 293/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.mlp.up_proj.weight]Loading weights:  87%|█▋| 294/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.post_attention_layernorm.weight]Loading weights:  87%|█▋| 294/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.post_attention_layernorm.weight]Loading weights:  87%|██████████▍ | 295/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.k_proj.bias]Loading weights:  87%|██████████▍ | 295/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.k_proj.bias]Loading weights:  88%|████████▊ | 296/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.k_proj.weight]Loading weights:  88%|████████▊ | 296/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.k_proj.weight]Loading weights:  88%|████████▊ | 297/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.o_proj.weight]Loading weights:  88%|████████▊ | 297/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.o_proj.weight]Loading weights:  88%|██████████▌ | 298/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.q_proj.bias]Loading weights:  88%|██████████▌ | 298/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.q_proj.bias]Loading weights:  88%|████████▊ | 299/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.q_proj.weight]Loading weights:  88%|████████▊ | 299/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.q_proj.weight]Loading weights:  89%|██████████▋ | 300/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.v_proj.bias]Loading weights:  89%|██████████▋ | 300/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.v_proj.bias]Loading weights:  89%|████████▉ | 301/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.v_proj.weight]Loading weights:  89%|████████▉ | 301/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.24.self_attn.v_proj.weight]Loading weights:  89%|█████████▊ | 302/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.25.input_layernorm.weight]Loading weights:  89%|█████████▊ | 302/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.25.input_layernorm.weight]Loading weights:  90%|███████████▋ | 303/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.25.mlp.down_proj.weight]Loading weights:  90%|███████████▋ | 303/338 [00:00<00:00, 989.02it/s, Materializing param=model.layers.25.mlp.down_proj.weight]Loading weights:  90%|██████████▊ | 304/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.mlp.down_proj.weight]Loading weights:  90%|██████████▊ | 304/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.mlp.gate_proj.weight]Loading weights:  90%|██████████▊ | 304/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.mlp.gate_proj.weight]Loading weights:  90%|████████████▋ | 305/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.mlp.up_proj.weight]Loading weights:  90%|████████████▋ | 305/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.mlp.up_proj.weight]Loading weights:  91%|▉| 306/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.post_attention_layernorm.weight]Loading weights:  91%|▉| 306/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.post_attention_layernorm.weight]Loading weights:  91%|█████████▉ | 307/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.k_proj.bias]Loading weights:  91%|█████████▉ | 307/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.k_proj.bias]Loading weights:  91%|████████▏| 308/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.k_proj.weight]Loading weights:  91%|████████▏| 308/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.k_proj.weight]Loading weights:  91%|████████▏| 309/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.o_proj.weight]Loading weights:  91%|████████▏| 309/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.o_proj.weight]Loading weights:  92%|██████████ | 310/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.q_proj.bias]Loading weights:  92%|██████████ | 310/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.q_proj.bias]Loading weights:  92%|████████▎| 311/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.q_proj.weight]Loading weights:  92%|████████▎| 311/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.q_proj.weight]Loading weights:  92%|██████████▏| 312/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.v_proj.bias]Loading weights:  92%|██████████▏| 312/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.v_proj.bias]Loading weights:  93%|████████▎| 313/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.v_proj.weight]Loading weights:  93%|████████▎| 313/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.25.self_attn.v_proj.weight]Loading weights:  93%|█████████▎| 314/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.input_layernorm.weight]Loading weights:  93%|█████████▎| 314/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.input_layernorm.weight]Loading weights:  93%|███████████▏| 315/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.mlp.down_proj.weight]Loading weights:  93%|███████████▏| 315/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.mlp.down_proj.weight]Loading weights:  93%|███████████▏| 316/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.mlp.gate_proj.weight]Loading weights:  93%|███████████▏| 316/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.mlp.gate_proj.weight]Loading weights:  94%|█████████████▏| 317/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.mlp.up_proj.weight]Loading weights:  94%|█████████████▏| 317/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.mlp.up_proj.weight]Loading weights:  94%|▉| 318/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.post_attention_layernorm.weight]Loading weights:  94%|▉| 318/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.post_attention_layernorm.weight]Loading weights:  94%|██████████▍| 319/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.k_proj.bias]Loading weights:  94%|██████████▍| 319/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.k_proj.bias]Loading weights:  95%|████████▌| 320/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.k_proj.weight]Loading weights:  95%|████████▌| 320/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.k_proj.weight]Loading weights:  95%|████████▌| 321/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.o_proj.weight]Loading weights:  95%|████████▌| 321/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.o_proj.weight]Loading weights:  95%|██████████▍| 322/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.q_proj.bias]Loading weights:  95%|██████████▍| 322/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.q_proj.bias]Loading weights:  96%|████████▌| 323/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.q_proj.weight]Loading weights:  96%|████████▌| 323/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.q_proj.weight]Loading weights:  96%|██████████▌| 324/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.v_proj.bias]Loading weights:  96%|██████████▌| 324/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.v_proj.bias]Loading weights:  96%|████████▋| 325/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.v_proj.weight]Loading weights:  96%|████████▋| 325/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.26.self_attn.v_proj.weight]Loading weights:  96%|█████████▋| 326/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.input_layernorm.weight]Loading weights:  96%|█████████▋| 326/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.input_layernorm.weight]Loading weights:  97%|███████████▌| 327/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.mlp.down_proj.weight]Loading weights:  97%|███████████▌| 327/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.mlp.down_proj.weight]Loading weights:  97%|███████████▋| 328/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.mlp.gate_proj.weight]Loading weights:  97%|███████████▋| 328/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.mlp.gate_proj.weight]Loading weights:  97%|█████████████▋| 329/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.mlp.up_proj.weight]Loading weights:  97%|█████████████▋| 329/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.mlp.up_proj.weight]Loading weights:  98%|▉| 330/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.post_attention_layernorm.weight]Loading weights:  98%|▉| 330/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.post_attention_layernorm.weight]Loading weights:  98%|██████████▊| 331/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.k_proj.bias]Loading weights:  98%|██████████▊| 331/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.k_proj.bias]Loading weights:  98%|████████▊| 332/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.k_proj.weight]Loading weights:  98%|████████▊| 332/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.k_proj.weight]Loading weights:  99%|████████▊| 333/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.o_proj.weight]Loading weights:  99%|████████▊| 333/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.o_proj.weight]Loading weights:  99%|██████████▊| 334/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.q_proj.bias]Loading weights:  99%|██████████▊| 334/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.q_proj.bias]Loading weights:  99%|████████▉| 335/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.q_proj.weight]Loading weights:  99%|████████▉| 335/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.q_proj.weight]Loading weights:  99%|██████████▉| 336/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.v_proj.bias]Loading weights:  99%|██████████▉| 336/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.v_proj.bias]Loading weights: 100%|████████▉| 337/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.v_proj.weight]Loading weights: 100%|████████▉| 337/338 [00:00<00:00, 1602.99it/s, Materializing param=model.layers.27.self_attn.v_proj.weight]Loading weights: 100%|███████████████████████████████| 338/338 [00:00<00:00, 1602.99it/s, Materializing param=model.norm.weight]Loading weights: 100%|███████████████████████████████| 338/338 [00:00<00:00, 1602.99it/s, Materializing param=model.norm.weight]Loading weights: 100%|███████████████████████████████| 338/338 [00:00<00:00, 1584.27it/s, Materializing param=model.norm.weight]
[2026-02-04 03:23:13,617] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:351] [PID:23602] Converting modules to torch.bfloat16
[2026-02-04 03:23:14,087] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:23602] Memory usage after model load 0.000GB ()
[2026-02-04 03:23:15,292] [WARNING] [torchao.<module>:39] [PID:23602] Skipping import of cpp extensions due to incompatible torch version 2.9.1+cu128 for torchao version 0.13.0
[2026-02-04 03:23:22,826] [INFO] [axolotl.train.save_initial_configs:406] [PID:23602] Pre-saving tokenizer to ./outputs...
[2026-02-04 03:23:23,029] [INFO] [axolotl.train.save_initial_configs:411] [PID:23602] Pre-saving model config to ./outputs...
[2026-02-04 03:23:23,032] [INFO] [axolotl.train.execute_training:207] [PID:23602] Starting trainer...
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mguijinson[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m setting up run l5xo86c3 (0.1s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣽[0m setting up run l5xo86c3 (0.1s)
[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.24.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/workspace/axolotl/wandb/run-20260204_032324-l5xo86c3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfallen-puddle-3[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/guijinson/FC-T2J[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/guijinson/FC-T2J/runs/l5xo86c3[0m
[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-02-04 03:23:27,200] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:23602] The Axolotl config has been saved to the WandB run under files.
  0%|                                                                                                  | 0/1751 [00:00<?, ?it/s]  0%|                                                                                       | 1/1751 [01:36<46:49:28, 96.32s/it]                                                                                                                                {'loss': '0.8204', 'grad_norm': '2.547', 'learning_rate': '0', 'ppl': '2.271', 'memory/max_active (GiB)': '69.89', 'memory/max_allocated (GiB)': '69.89', 'memory/device_reserved (GiB)': '112.1', 'tokens/train_per_sec_per_gpu': '87.66', 'tokens/total': 1834530, 'tokens/trainable': 671788, 'epoch': '0.001714'}
  0%|                                                                                       | 1/1751 [01:36<46:49:28, 96.32s/it]  0%|                                                                                       | 2/1751 [02:55<41:51:37, 86.16s/it]                                                                                                                                {'loss': '0.7941', 'grad_norm': '2.438', 'learning_rate': '2.299e-07', 'ppl': '2.213', 'memory/max_active (GiB)': '73.48', 'memory/max_allocated (GiB)': '73.48', 'memory/device_reserved (GiB)': '112.1', 'tokens/train_per_sec_per_gpu': '50.37', 'tokens/total': 3812934, 'tokens/trainable': 1390331, 'epoch': '0.003427'}
  0%|                                                                                       | 2/1751 [02:55<41:51:37, 86.16s/it]  0%|▏                                                                                      | 3/1751 [04:04<37:58:09, 78.20s/it]                                                                                                                                {'loss': '0.8036', 'grad_norm': '2.484', 'learning_rate': '4.598e-07', 'ppl': '2.233', 'memory/max_active (GiB)': '75.09', 'memory/max_allocated (GiB)': '75.09', 'memory/device_reserved (GiB)': '138.7', 'tokens/train_per_sec_per_gpu': '134.4', 'tokens/total': 5715100, 'tokens/trainable': 2090017, 'epoch': '0.005141'}
  0%|▏                                                                                      | 3/1751 [04:04<37:58:09, 78.20s/it]  0%|▏                                                                                      | 4/1751 [05:12<36:09:49, 74.52s/it]                                                                                                                                {'loss': '0.769', 'grad_norm': '2.375', 'learning_rate': '6.897e-07', 'ppl': '2.158', 'memory/max_active (GiB)': '76.74', 'memory/max_allocated (GiB)': '76.74', 'memory/device_reserved (GiB)': '124.1', 'tokens/train_per_sec_per_gpu': '158.6', 'tokens/total': 7674006, 'tokens/trainable': 2793873, 'epoch': '0.006854'}
  0%|▏                                                                                      | 4/1751 [05:12<36:09:49, 74.52s/it]  0%|▏                                                                                      | 5/1751 [06:19<34:47:31, 71.74s/it]                                                                                                                                {'loss': '0.7975', 'grad_norm': '2.453', 'learning_rate': '9.195e-07', 'ppl': '2.22', 'memory/max_active (GiB)': '75.56', 'memory/max_allocated (GiB)': '75.56', 'memory/device_reserved (GiB)': '124.1', 'tokens/train_per_sec_per_gpu': '59.59', 'tokens/total': 9640220, 'tokens/trainable': 3510346, 'epoch': '0.008568'}
  0%|▏                                                                                      | 5/1751 [06:19<34:47:31, 71.74s/it]  0%|▎                                                                                      | 6/1751 [07:23<33:28:21, 69.06s/it]                                                                                                                                {'loss': '0.8214', 'grad_norm': '2.484', 'learning_rate': '1.149e-06', 'ppl': '2.274', 'memory/max_active (GiB)': '76.28', 'memory/max_allocated (GiB)': '76.28', 'memory/device_reserved (GiB)': '124.1', 'tokens/train_per_sec_per_gpu': '162', 'tokens/total': 11548152, 'tokens/trainable': 4220081, 'epoch': '0.01028'}
  0%|▎                                                                                      | 6/1751 [07:23<33:28:21, 69.06s/it]  0%|▎                                                                                      | 7/1751 [08:31<33:13:53, 68.60s/it]                                                                                                                                {'loss': '0.7865', 'grad_norm': '2.344', 'learning_rate': '1.379e-06', 'ppl': '2.196', 'memory/max_active (GiB)': '75.34', 'memory/max_allocated (GiB)': '75.34', 'memory/device_reserved (GiB)': '124.1', 'tokens/train_per_sec_per_gpu': '43.42', 'tokens/total': 13498976, 'tokens/trainable': 4961268, 'epoch': '0.012'}
  0%|▎                                                                                      | 7/1751 [08:31<33:13:53, 68.60s/it]  0%|▍                                                                                      | 8/1751 [09:34<32:18:28, 66.73s/it]                                                                                                                                {'loss': '0.8021', 'grad_norm': '2.422', 'learning_rate': '1.609e-06', 'ppl': '2.23', 'memory/max_active (GiB)': '74.5', 'memory/max_allocated (GiB)': '74.5', 'memory/device_reserved (GiB)': '124.1', 'tokens/train_per_sec_per_gpu': '35.38', 'tokens/total': 15382640, 'tokens/trainable': 5646872, 'epoch': '0.01371'}
  0%|▍                                                                                      | 8/1751 [09:34<32:18:28, 66.73s/it]  1%|▍                                                                                      | 9/1751 [10:38<32:00:39, 66.15s/it]                                                                                                                                {'loss': '0.8016', 'grad_norm': '2.375', 'learning_rate': '1.839e-06', 'ppl': '2.229', 'memory/max_active (GiB)': '76.11', 'memory/max_allocated (GiB)': '76.11', 'memory/device_reserved (GiB)': '124.1', 'tokens/train_per_sec_per_gpu': '190.6', 'tokens/total': 17384096, 'tokens/trainable': 6378150, 'epoch': '0.01542'}
  1%|▍                                                                                      | 9/1751 [10:38<32:00:39, 66.15s/it]  1%|▍                                                                                     | 10/1751 [11:38<30:59:44, 64.09s/it]                                                                                                                                {'loss': '0.8232', 'grad_norm': '2.484', 'learning_rate': '2.069e-06', 'ppl': '2.278', 'memory/max_active (GiB)': '72.24', 'memory/max_allocated (GiB)': '72.24', 'memory/device_reserved (GiB)': '124.1', 'tokens/train_per_sec_per_gpu': '45.02', 'tokens/total': 19170262, 'tokens/trainable': 7049033, 'epoch': '0.01714'}
  1%|▍                                                                                     | 10/1751 [11:38<30:59:44, 64.09s/it]  1%|▌                                                                                     | 11/1751 [12:39<30:31:18, 63.15s/it]                                                                                                                                {'loss': '0.8254', 'grad_norm': '2.406', 'learning_rate': '2.299e-06', 'ppl': '2.283', 'memory/max_active (GiB)': '76.77', 'memory/max_allocated (GiB)': '76.77', 'memory/device_reserved (GiB)': '138.8', 'tokens/train_per_sec_per_gpu': '89.47', 'tokens/total': 21041714, 'tokens/trainable': 7736419, 'epoch': '0.01885'}
  1%|▌                                                                                     | 11/1751 [12:39<30:31:18, 63.15s/it]  1%|▌                                                                                     | 12/1751 [13:38<29:55:49, 61.96s/it]                                                                                                                                {'loss': '0.8339', 'grad_norm': '2.406', 'learning_rate': '2.529e-06', 'ppl': '2.302', 'memory/max_active (GiB)': '75.03', 'memory/max_allocated (GiB)': '75.03', 'memory/device_reserved (GiB)': '81.85', 'tokens/train_per_sec_per_gpu': '97.62', 'tokens/total': 22841428, 'tokens/trainable': 8441855, 'epoch': '0.02056'}
  1%|▌                                                                                     | 12/1751 [13:38<29:55:49, 61.96s/it]  1%|▋                                                                                     | 13/1751 [14:40<29:53:03, 61.90s/it]                                                                                                                                {'loss': '0.8217', 'grad_norm': '2.312', 'learning_rate': '2.759e-06', 'ppl': '2.274', 'memory/max_active (GiB)': '73.28', 'memory/max_allocated (GiB)': '73.28', 'memory/device_reserved (GiB)': '81.85', 'tokens/train_per_sec_per_gpu': '85.77', 'tokens/total': 24697532, 'tokens/trainable': 9146486, 'epoch': '0.02228'}
  1%|▋                                                                                     | 13/1751 [14:40<29:53:03, 61.90s/it]  1%|▋                                                                                     | 14/1751 [15:43<30:01:43, 62.24s/it]                                                                                                                                {'loss': '0.8174', 'grad_norm': '2.266', 'learning_rate': '2.989e-06', 'ppl': '2.265', 'memory/max_active (GiB)': '74.64', 'memory/max_allocated (GiB)': '74.64', 'memory/device_reserved (GiB)': '81.85', 'tokens/train_per_sec_per_gpu': '96', 'tokens/total': 26579190, 'tokens/trainable': 9856660, 'epoch': '0.02399'}
  1%|▋                                                                                     | 14/1751 [15:43<30:01:43, 62.24s/it]  1%|▋                                                                                     | 15/1751 [16:46<30:04:09, 62.36s/it]                                                                                                                                {'loss': '0.8005', 'grad_norm': '2.188', 'learning_rate': '3.218e-06', 'ppl': '2.227', 'memory/max_active (GiB)': '73.66', 'memory/max_allocated (GiB)': '73.66', 'memory/device_reserved (GiB)': '81.85', 'tokens/train_per_sec_per_gpu': '175.6', 'tokens/total': 28523402, 'tokens/trainable': 10582044, 'epoch': '0.0257'}
  1%|▋                                                                                     | 15/1751 [16:46<30:04:09, 62.36s/it]  1%|▊                                                                                     | 16/1751 [17:49<30:15:10, 62.77s/it]                                                                                                                                {'loss': '0.7815', 'grad_norm': '2.047', 'learning_rate': '3.448e-06', 'ppl': '2.185', 'memory/max_active (GiB)': '72.43', 'memory/max_allocated (GiB)': '72.43', 'memory/device_reserved (GiB)': '81.85', 'tokens/train_per_sec_per_gpu': '100.2', 'tokens/total': 30502776, 'tokens/trainable': 11332779, 'epoch': '0.02742'}
  1%|▊                                                                                     | 16/1751 [17:49<30:15:10, 62.77s/it]  1%|▊                                                                                     | 17/1751 [18:49<29:51:57, 62.01s/it]                                                                                                                                {'loss': '0.8081', 'grad_norm': '2.172', 'learning_rate': '3.678e-06', 'ppl': '2.244', 'memory/max_active (GiB)': '73.44', 'memory/max_allocated (GiB)': '73.44', 'memory/device_reserved (GiB)': '87.4', 'tokens/train_per_sec_per_gpu': '110.4', 'tokens/total': 32391782, 'tokens/trainable': 12028023, 'epoch': '0.02913'}
  1%|▊                                                                                     | 17/1751 [18:50<29:51:57, 62.01s/it]  1%|▉                                                                                     | 18/1751 [19:55<30:18:04, 62.95s/it]                                                                                                                                {'loss': '0.7662', 'grad_norm': '2', 'learning_rate': '3.908e-06', 'ppl': '2.151', 'memory/max_active (GiB)': '75.09', 'memory/max_allocated (GiB)': '75.09', 'memory/device_reserved (GiB)': '87.4', 'tokens/train_per_sec_per_gpu': '87.8', 'tokens/total': 34363064, 'tokens/trainable': 12766257, 'epoch': '0.03084'}
  1%|▉                                                                                     | 18/1751 [19:55<30:18:04, 62.95s/it]  1%|▉                                                                                     | 19/1751 [20:57<30:09:13, 62.68s/it]                                                                                                                                {'loss': '0.8027', 'grad_norm': '2.031', 'learning_rate': '4.138e-06', 'ppl': '2.232', 'memory/max_active (GiB)': '73.32', 'memory/max_allocated (GiB)': '73.32', 'memory/device_reserved (GiB)': '87.4', 'tokens/train_per_sec_per_gpu': '50.14', 'tokens/total': 36288620, 'tokens/trainable': 13469371, 'epoch': '0.03256'}
  1%|▉                                                                                     | 19/1751 [20:57<30:09:13, 62.68s/it]  1%|▉                                                                                     | 20/1751 [21:58<29:55:41, 62.24s/it]                                                                                                                                {'loss': '0.759', 'grad_norm': '1.906', 'learning_rate': '4.368e-06', 'ppl': '2.136', 'memory/max_active (GiB)': '73.85', 'memory/max_allocated (GiB)': '73.85', 'memory/device_reserved (GiB)': '87.4', 'tokens/train_per_sec_per_gpu': '229.5', 'tokens/total': 38160128, 'tokens/trainable': 14156393, 'epoch': '0.03427'}
  1%|▉                                                                                     | 20/1751 [21:58<29:55:41, 62.24s/it]  1%|█                                                                                     | 21/1751 [22:58<29:35:09, 61.57s/it]                                                                                                                                {'loss': '0.7921', 'grad_norm': '1.93', 'learning_rate': '4.598e-06', 'ppl': '2.208', 'memory/max_active (GiB)': '70.08', 'memory/max_allocated (GiB)': '70.08', 'memory/device_reserved (GiB)': '87.4', 'tokens/train_per_sec_per_gpu': '87.53', 'tokens/total': 39978416, 'tokens/trainable': 14801020, 'epoch': '0.03599'}
  1%|█                                                                                     | 21/1751 [22:58<29:35:09, 61.57s/it]  1%|█                                                                                     | 22/1751 [24:02<29:59:18, 62.44s/it]                                                                                                                                {'loss': '0.8098', 'grad_norm': '1.773', 'learning_rate': '4.828e-06', 'ppl': '2.248', 'memory/max_active (GiB)': '77.55', 'memory/max_allocated (GiB)': '77.55', 'memory/device_reserved (GiB)': '87.4', 'tokens/train_per_sec_per_gpu': '17.75', 'tokens/total': 41954892, 'tokens/trainable': 15520316, 'epoch': '0.0377'}
  1%|█                                                                                     | 22/1751 [24:02<29:59:18, 62.44s/it]  1%|█▏                                                                                    | 23/1751 [25:04<29:48:15, 62.09s/it]                                                                                                                                {'loss': '0.7599', 'grad_norm': '1.664', 'learning_rate': '5.057e-06', 'ppl': '2.138', 'memory/max_active (GiB)': '76.24', 'memory/max_allocated (GiB)': '76.24', 'memory/device_reserved (GiB)': '87.4', 'tokens/train_per_sec_per_gpu': '47.45', 'tokens/total': 43843760, 'tokens/trainable': 16190402, 'epoch': '0.03941'}
  1%|█▏                                                                                    | 23/1751 [25:04<29:48:15, 62.09s/it]  1%|█▏                                                                                    | 24/1751 [26:09<30:19:11, 63.20s/it]                                                                                                                                {'loss': '0.731', 'grad_norm': '1.555', 'learning_rate': '5.287e-06', 'ppl': '2.077', 'memory/max_active (GiB)': '72.8', 'memory/max_allocated (GiB)': '72.8', 'memory/device_reserved (GiB)': '87.4', 'tokens/train_per_sec_per_gpu': '53.98', 'tokens/total': 45850772, 'tokens/trainable': 16967408, 'epoch': '0.04113'}
  1%|█▏                                                                                    | 24/1751 [26:09<30:19:11, 63.20s/it]  1%|█▏                                                                                    | 25/1751 [27:12<30:11:49, 62.98s/it]                                                                                                                                {'loss': '0.7891', 'grad_norm': '1.586', 'learning_rate': '5.517e-06', 'ppl': '2.201', 'memory/max_active (GiB)': '75.23', 'memory/max_allocated (GiB)': '75.23', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '71.35', 'tokens/total': 47771704, 'tokens/trainable': 17685610, 'epoch': '0.04284'}
  1%|█▏                                                                                    | 25/1751 [27:12<30:11:49, 62.98s/it]  1%|█▎                                                                                    | 26/1751 [28:12<29:45:37, 62.11s/it]                                                                                                                                {'loss': '0.802', 'grad_norm': '1.547', 'learning_rate': '5.747e-06', 'ppl': '2.23', 'memory/max_active (GiB)': '72.68', 'memory/max_allocated (GiB)': '72.68', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '184.8', 'tokens/total': 49653912, 'tokens/trainable': 18396226, 'epoch': '0.04455'}
  1%|█▎                                                                                    | 26/1751 [28:12<29:45:37, 62.11s/it]  2%|█▎                                                                                    | 27/1751 [29:16<29:57:58, 62.57s/it]                                                                                                                                {'loss': '0.7437', 'grad_norm': '1.383', 'learning_rate': '5.977e-06', 'ppl': '2.104', 'memory/max_active (GiB)': '74.09', 'memory/max_allocated (GiB)': '74.09', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '86.33', 'tokens/total': 51655384, 'tokens/trainable': 19143612, 'epoch': '0.04627'}
  2%|█▎                                                                                    | 27/1751 [29:16<29:57:58, 62.57s/it]  2%|█▍                                                                                    | 28/1751 [30:19<29:59:50, 62.68s/it]                                                                                                                                {'loss': '0.7873', 'grad_norm': '1.414', 'learning_rate': '6.207e-06', 'ppl': '2.197', 'memory/max_active (GiB)': '74.1', 'memory/max_allocated (GiB)': '74.1', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '81.41', 'tokens/total': 53622576, 'tokens/trainable': 19854062, 'epoch': '0.04798'}
  2%|█▍                                                                                    | 28/1751 [30:19<29:59:50, 62.68s/it]  2%|█▍                                                                                    | 29/1751 [31:23<30:11:59, 63.14s/it]                                                                                                                                {'loss': '0.7349', 'grad_norm': '1.297', 'learning_rate': '6.437e-06', 'ppl': '2.085', 'memory/max_active (GiB)': '78', 'memory/max_allocated (GiB)': '78', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '151.2', 'tokens/total': 55639160, 'tokens/trainable': 20612934, 'epoch': '0.04969'}
  2%|█▍                                                                                    | 29/1751 [31:23<30:11:59, 63.14s/it]  2%|█▍                                                                                    | 30/1751 [32:22<29:41:30, 62.11s/it]                                                                                                                                {'loss': '0.7467', 'grad_norm': '1.305', 'learning_rate': '6.667e-06', 'ppl': '2.11', 'memory/max_active (GiB)': '68.9', 'memory/max_allocated (GiB)': '68.9', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '152.4', 'tokens/total': 57444944, 'tokens/trainable': 21251388, 'epoch': '0.05141'}
  2%|█▍                                                                                    | 30/1751 [32:22<29:41:30, 62.11s/it]  2%|█▌                                                                                    | 31/1751 [33:24<29:37:22, 62.00s/it]                                                                                                                                {'loss': '0.7703', 'grad_norm': '1.219', 'learning_rate': '6.897e-06', 'ppl': '2.16', 'memory/max_active (GiB)': '75.41', 'memory/max_allocated (GiB)': '75.41', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '76.28', 'tokens/total': 59329272, 'tokens/trainable': 21964028, 'epoch': '0.05312'}
  2%|█▌                                                                                    | 31/1751 [33:24<29:37:22, 62.00s/it]  2%|█▌                                                                                    | 32/1751 [34:26<29:37:19, 62.04s/it]                                                                                                                                {'loss': '0.7068', 'grad_norm': '1.102', 'learning_rate': '7.126e-06', 'ppl': '2.027', 'memory/max_active (GiB)': '71.96', 'memory/max_allocated (GiB)': '71.96', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '211.5', 'tokens/total': 61280488, 'tokens/trainable': 22693880, 'epoch': '0.05483'}
  2%|█▌                                                                                    | 32/1751 [34:26<29:37:19, 62.04s/it]  2%|█▌                                                                                    | 33/1751 [35:28<29:35:39, 62.01s/it]                                                                                                                                {'loss': '0.771', 'grad_norm': '1.133', 'learning_rate': '7.356e-06', 'ppl': '2.162', 'memory/max_active (GiB)': '72.04', 'memory/max_allocated (GiB)': '72.04', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '64.11', 'tokens/total': 63146424, 'tokens/trainable': 23371438, 'epoch': '0.05655'}
  2%|█▌                                                                                    | 33/1751 [35:28<29:35:39, 62.01s/it]  2%|█▋                                                                                    | 34/1751 [36:29<29:26:33, 61.73s/it]                                                                                                                                {'loss': '0.7335', 'grad_norm': '0.9961', 'learning_rate': '7.586e-06', 'ppl': '2.082', 'memory/max_active (GiB)': '71.32', 'memory/max_allocated (GiB)': '71.32', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '52.22', 'tokens/total': 65025712, 'tokens/trainable': 24095372, 'epoch': '0.05826'}
  2%|█▋                                                                                    | 34/1751 [36:29<29:26:33, 61.73s/it]  2%|█▋                                                                                    | 35/1751 [37:30<29:18:41, 61.49s/it]                                                                                                                                {'loss': '0.7167', 'grad_norm': '0.9531', 'learning_rate': '7.816e-06', 'ppl': '2.048', 'memory/max_active (GiB)': '74.41', 'memory/max_allocated (GiB)': '74.41', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '78.68', 'tokens/total': 66931924, 'tokens/trainable': 24778120, 'epoch': '0.05998'}
  2%|█▋                                                                                    | 35/1751 [37:30<29:18:41, 61.49s/it]  2%|█▊                                                                                    | 36/1751 [38:30<29:01:10, 60.92s/it]                                                                                                                                {'loss': '0.778', 'grad_norm': '0.9258', 'learning_rate': '8.046e-06', 'ppl': '2.177', 'memory/max_active (GiB)': '72.12', 'memory/max_allocated (GiB)': '72.12', 'memory/device_reserved (GiB)': '88.8', 'tokens/train_per_sec_per_gpu': '84.81', 'tokens/total': 68792224, 'tokens/trainable': 25453048, 'epoch': '0.06169'}
  2%|█▊                                                                                    | 36/1751 [38:30<29:01:10, 60.92s/it]  2%|█▊                                                                                    | 37/1751 [39:30<28:48:58, 60.52s/it]                                                                                                                                {'loss': '0.7562', 'grad_norm': '0.8711', 'learning_rate': '8.276e-06', 'ppl': '2.13', 'memory/max_active (GiB)': '74.95', 'memory/max_allocated (GiB)': '74.95', 'memory/device_reserved (GiB)': '90.21', 'tokens/train_per_sec_per_gpu': '42.07', 'tokens/total': 70691112, 'tokens/trainable': 26120194, 'epoch': '0.0634'}
  2%|█▊                                                                                    | 37/1751 [39:30<28:48:58, 60.52s/it]  2%|█▊                                                                                    | 38/1751 [40:28<28:34:10, 60.04s/it]                                                                                                                                {'loss': '0.7529', 'grad_norm': '0.8164', 'learning_rate': '8.506e-06', 'ppl': '2.123', 'memory/max_active (GiB)': '71.95', 'memory/max_allocated (GiB)': '71.95', 'memory/device_reserved (GiB)': '90.21', 'tokens/train_per_sec_per_gpu': '91.53', 'tokens/total': 72533640, 'tokens/trainable': 26808220, 'epoch': '0.06512'}
  2%|█▊                                                                                    | 38/1751 [40:28<28:34:10, 60.04s/it]  2%|█▉                                                                                    | 39/1751 [41:31<28:53:22, 60.75s/it]                                                                                                                                {'loss': '0.7044', 'grad_norm': '0.7461', 'learning_rate': '8.736e-06', 'ppl': '2.023', 'memory/max_active (GiB)': '73.34', 'memory/max_allocated (GiB)': '73.34', 'memory/device_reserved (GiB)': '90.21', 'tokens/train_per_sec_per_gpu': '119', 'tokens/total': 74445152, 'tokens/trainable': 27542280, 'epoch': '0.06683'}
  2%|█▉                                                                                    | 39/1751 [41:31<28:53:22, 60.75s/it]  2%|█▉                                                                                    | 40/1751 [42:31<28:46:59, 60.56s/it]                                                                                                                                {'loss': '0.7106', 'grad_norm': '0.7188', 'learning_rate': '8.966e-06', 'ppl': '2.035', 'memory/max_active (GiB)': '75.23', 'memory/max_allocated (GiB)': '75.23', 'memory/device_reserved (GiB)': '90.21', 'tokens/train_per_sec_per_gpu': '76.05', 'tokens/total': 76300240, 'tokens/trainable': 28201420, 'epoch': '0.06854'}
  2%|█▉                                                                                    | 40/1751 [42:31<28:46:59, 60.56s/it]  2%|██                                                                                    | 41/1751 [43:34<29:08:18, 61.34s/it]                                                                                                                                {'loss': '0.7095', 'grad_norm': '0.6719', 'learning_rate': '9.195e-06', 'ppl': '2.033', 'memory/max_active (GiB)': '76.66', 'memory/max_allocated (GiB)': '76.66', 'memory/device_reserved (GiB)': '90.21', 'tokens/train_per_sec_per_gpu': '68.79', 'tokens/total': 78209464, 'tokens/trainable': 28921776, 'epoch': '0.07026'}
  2%|██                                                                                    | 41/1751 [43:34<29:08:18, 61.34s/it]  2%|██                                                                                    | 42/1751 [44:37<29:22:09, 61.87s/it]                                                                                                                                {'loss': '0.6223', 'grad_norm': '0.6133', 'learning_rate': '9.425e-06', 'ppl': '1.863', 'memory/max_active (GiB)': '77.24', 'memory/max_allocated (GiB)': '77.24', 'memory/device_reserved (GiB)': '90.21', 'tokens/train_per_sec_per_gpu': '84.49', 'tokens/total': 80167216, 'tokens/trainable': 29643618, 'epoch': '0.07197'}
  2%|██                                                                                    | 42/1751 [44:37<29:22:09, 61.87s/it]  2%|██                                                                                    | 43/1751 [45:40<29:26:36, 62.06s/it]                                                                                                                                {'loss': '0.6667', 'grad_norm': '0.6016', 'learning_rate': '9.655e-06', 'ppl': '1.948', 'memory/max_active (GiB)': '77.8', 'memory/max_allocated (GiB)': '77.8', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '36.58', 'tokens/total': 82139048, 'tokens/trainable': 30370160, 'epoch': '0.07368'}
  2%|██                                                                                    | 43/1751 [45:40<29:26:36, 62.06s/it]  3%|██▏                                                                                   | 44/1751 [46:44<29:41:26, 62.62s/it]                                                                                                                                {'loss': '0.6712', 'grad_norm': '0.6016', 'learning_rate': '9.885e-06', 'ppl': '1.956', 'memory/max_active (GiB)': '73.95', 'memory/max_allocated (GiB)': '73.95', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '75.09', 'tokens/total': 84154208, 'tokens/trainable': 31109848, 'epoch': '0.0754'}
  3%|██▏                                                                                   | 44/1751 [46:44<29:41:26, 62.62s/it]  3%|██▏                                                                                   | 45/1751 [47:40<28:43:55, 60.63s/it]                                                                                                                                {'loss': '0.6693', 'grad_norm': '0.6094', 'learning_rate': '1.011e-05', 'ppl': '1.953', 'memory/max_active (GiB)': '70.72', 'memory/max_allocated (GiB)': '70.72', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '45.23', 'tokens/total': 85882736, 'tokens/trainable': 31745420, 'epoch': '0.07711'}
  3%|██▏                                                                                   | 45/1751 [47:40<28:43:55, 60.63s/it]  3%|██▎                                                                                   | 46/1751 [48:40<28:38:04, 60.46s/it]                                                                                                                                {'loss': '0.6886', 'grad_norm': '0.5586', 'learning_rate': '1.034e-05', 'ppl': '1.991', 'memory/max_active (GiB)': '70.98', 'memory/max_allocated (GiB)': '70.98', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '83.22', 'tokens/total': 87756096, 'tokens/trainable': 32458464, 'epoch': '0.07883'}
  3%|██▎                                                                                   | 46/1751 [48:40<28:38:04, 60.46s/it]  3%|██▎                                                                                   | 47/1751 [49:38<28:16:08, 59.72s/it]                                                                                                                                {'loss': '0.6424', 'grad_norm': '0.543', 'learning_rate': '1.057e-05', 'ppl': '1.901', 'memory/max_active (GiB)': '75.89', 'memory/max_allocated (GiB)': '75.89', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '73.33', 'tokens/total': 89564656, 'tokens/trainable': 33112438, 'epoch': '0.08054'}
  3%|██▎                                                                                   | 47/1751 [49:38<28:16:08, 59.72s/it]  3%|██▎                                                                                   | 48/1751 [50:38<28:19:00, 59.86s/it]                                                                                                                                {'loss': '0.6832', 'grad_norm': '0.5469', 'learning_rate': '1.08e-05', 'ppl': '1.98', 'memory/max_active (GiB)': '75.77', 'memory/max_allocated (GiB)': '75.77', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '59.16', 'tokens/total': 91471760, 'tokens/trainable': 33784004, 'epoch': '0.08225'}
  3%|██▎                                                                                   | 48/1751 [50:38<28:19:00, 59.86s/it]  3%|██▍                                                                                   | 49/1751 [51:38<28:23:10, 60.04s/it]                                                                                                                                {'loss': '0.7143', 'grad_norm': '0.5391', 'learning_rate': '1.103e-05', 'ppl': '2.043', 'memory/max_active (GiB)': '68.29', 'memory/max_allocated (GiB)': '68.29', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '95.35', 'tokens/total': 93368856, 'tokens/trainable': 34498816, 'epoch': '0.08397'}
  3%|██▍                                                                                   | 49/1751 [51:38<28:23:10, 60.04s/it]  3%|██▍                                                                                   | 50/1751 [52:41<28:47:08, 60.92s/it]                                                                                                                                {'loss': '0.6475', 'grad_norm': '0.4805', 'learning_rate': '1.126e-05', 'ppl': '1.911', 'memory/max_active (GiB)': '72.34', 'memory/max_allocated (GiB)': '72.34', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '71.48', 'tokens/total': 95380976, 'tokens/trainable': 35228440, 'epoch': '0.08568'}
  3%|██▍                                                                                   | 50/1751 [52:41<28:47:08, 60.92s/it]  3%|██▌                                                                                   | 51/1751 [53:43<28:55:24, 61.25s/it]                                                                                                                                {'loss': '0.636', 'grad_norm': '0.4473', 'learning_rate': '1.149e-05', 'ppl': '1.889', 'memory/max_active (GiB)': '74.63', 'memory/max_allocated (GiB)': '74.63', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '251', 'tokens/total': 97323536, 'tokens/trainable': 35945972, 'epoch': '0.08739'}
  3%|██▌                                                                                   | 51/1751 [53:43<28:55:24, 61.25s/it]  3%|██▌                                                                                   | 52/1751 [54:48<29:21:56, 62.22s/it]                                                                                                                                {'loss': '0.6288', 'grad_norm': '0.4121', 'learning_rate': '1.172e-05', 'ppl': '1.875', 'memory/max_active (GiB)': '75.62', 'memory/max_allocated (GiB)': '75.62', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '107.8', 'tokens/total': 99399136, 'tokens/trainable': 36706784, 'epoch': '0.08911'}
  3%|██▌                                                                                   | 52/1751 [54:48<29:21:56, 62.22s/it]  3%|██▌                                                                                   | 53/1751 [55:51<29:30:25, 62.56s/it]                                                                                                                                {'loss': '0.6098', 'grad_norm': '0.4004', 'learning_rate': '1.195e-05', 'ppl': '1.84', 'memory/max_active (GiB)': '74.91', 'memory/max_allocated (GiB)': '74.91', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '37.19', 'tokens/total': 101420968, 'tokens/trainable': 37478376, 'epoch': '0.09082'}
  3%|██▌                                                                                   | 53/1751 [55:51<29:30:25, 62.56s/it]  3%|██▋                                                                                   | 54/1751 [56:52<29:14:01, 62.02s/it]                                                                                                                                {'loss': '0.6554', 'grad_norm': '0.4238', 'learning_rate': '1.218e-05', 'ppl': '1.926', 'memory/max_active (GiB)': '71.72', 'memory/max_allocated (GiB)': '71.72', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '53.8', 'tokens/total': 103333040, 'tokens/trainable': 38183056, 'epoch': '0.09253'}
  3%|██▋                                                                                   | 54/1751 [56:52<29:14:01, 62.02s/it]  3%|██▋                                                                                   | 55/1751 [57:52<28:53:22, 61.32s/it]                                                                                                                                {'loss': '0.676', 'grad_norm': '0.4375', 'learning_rate': '1.241e-05', 'ppl': '1.966', 'memory/max_active (GiB)': '74.42', 'memory/max_allocated (GiB)': '74.42', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '180', 'tokens/total': 105183200, 'tokens/trainable': 38833828, 'epoch': '0.09425'}
  3%|██▋                                                                                   | 55/1751 [57:52<28:53:22, 61.32s/it]  3%|██▊                                                                                   | 56/1751 [58:55<29:07:22, 61.85s/it]                                                                                                                                {'loss': '0.631', 'grad_norm': '0.3906', 'learning_rate': '1.264e-05', 'ppl': '1.88', 'memory/max_active (GiB)': '75.68', 'memory/max_allocated (GiB)': '75.68', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '155.9', 'tokens/total': 107213576, 'tokens/trainable': 39613108, 'epoch': '0.09596'}
  3%|██▊                                                                                   | 56/1751 [58:55<29:07:22, 61.85s/it]  3%|██▊                                                                                   | 57/1751 [59:56<28:57:25, 61.54s/it]                                                                                                                                {'loss': '0.6584', 'grad_norm': '0.3945', 'learning_rate': '1.287e-05', 'ppl': '1.932', 'memory/max_active (GiB)': '75.68', 'memory/max_allocated (GiB)': '75.68', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '197.1', 'tokens/total': 109112864, 'tokens/trainable': 40300148, 'epoch': '0.09767'}
  3%|██▊                                                                                   | 57/1751 [59:56<28:57:25, 61.54s/it]  3%|██▊                                                                                 | 58/1751 [1:00:57<28:56:12, 61.53s/it]                                                                                                                                {'loss': '0.6236', 'grad_norm': '0.3691', 'learning_rate': '1.31e-05', 'ppl': '1.866', 'memory/max_active (GiB)': '75.66', 'memory/max_allocated (GiB)': '75.66', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '80.95', 'tokens/total': 111066544, 'tokens/trainable': 41024076, 'epoch': '0.09939'}
  3%|██▊                                                                                 | 58/1751 [1:00:57<28:56:12, 61.53s/it]  3%|██▊                                                                                 | 59/1751 [1:01:56<28:29:30, 60.62s/it]                                                                                                                                {'loss': '0.628', 'grad_norm': '0.3672', 'learning_rate': '1.333e-05', 'ppl': '1.874', 'memory/max_active (GiB)': '74.19', 'memory/max_allocated (GiB)': '74.19', 'memory/device_reserved (GiB)': '91.62', 'tokens/train_per_sec_per_gpu': '61.96', 'tokens/total': 112855568, 'tokens/trainable': 41704664, 'epoch': '0.1011'}
  3%|██▊                                                                                 | 59/1751 [1:01:56<28:29:30, 60.62s/it]  3%|██▉                                                                                 | 60/1751 [1:02:55<28:17:08, 60.22s/it]                                                                                                                                {'loss': '0.6274', 'grad_norm': '0.373', 'learning_rate': '1.356e-05', 'ppl': '1.873', 'memory/max_active (GiB)': '71.27', 'memory/max_allocated (GiB)': '71.27', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '65.88', 'tokens/total': 114666024, 'tokens/trainable': 42401496, 'epoch': '0.1028'}
  3%|██▉                                                                                 | 60/1751 [1:02:55<28:17:08, 60.22s/it]  3%|██▉                                                                                 | 61/1751 [1:03:55<28:14:37, 60.16s/it]                                                                                                                                {'loss': '0.6274', 'grad_norm': '0.3379', 'learning_rate': '1.379e-05', 'ppl': '1.873', 'memory/max_active (GiB)': '74.56', 'memory/max_allocated (GiB)': '74.56', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '155.3', 'tokens/total': 116532048, 'tokens/trainable': 43100796, 'epoch': '0.1045'}
  3%|██▉                                                                                 | 61/1751 [1:03:55<28:14:37, 60.16s/it]  4%|██▉                                                                                 | 62/1751 [1:04:57<28:34:03, 60.89s/it]                                                                                                                                {'loss': '0.6346', 'grad_norm': '0.3867', 'learning_rate': '1.402e-05', 'ppl': '1.886', 'memory/max_active (GiB)': '75.3', 'memory/max_allocated (GiB)': '75.3', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '202.1', 'tokens/total': 118505384, 'tokens/trainable': 43826752, 'epoch': '0.1062'}
  4%|██▉                                                                                 | 62/1751 [1:04:57<28:34:03, 60.89s/it]  4%|███                                                                                 | 63/1751 [1:05:58<28:28:18, 60.72s/it]                                                                                                                                {'loss': '0.6402', 'grad_norm': '0.3477', 'learning_rate': '1.425e-05', 'ppl': '1.897', 'memory/max_active (GiB)': '73.11', 'memory/max_allocated (GiB)': '73.11', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '162.8', 'tokens/total': 120396024, 'tokens/trainable': 44552896, 'epoch': '0.108'}
  4%|███                                                                                 | 63/1751 [1:05:58<28:28:18, 60.72s/it]  4%|███                                                                                 | 64/1751 [1:06:59<28:33:37, 60.95s/it]                                                                                                                                {'loss': '0.588', 'grad_norm': '0.3027', 'learning_rate': '1.448e-05', 'ppl': '1.8', 'memory/max_active (GiB)': '73.53', 'memory/max_allocated (GiB)': '73.53', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '212.9', 'tokens/total': 122364504, 'tokens/trainable': 45306552, 'epoch': '0.1097'}
  4%|███                                                                                 | 64/1751 [1:06:59<28:33:37, 60.95s/it]  4%|███                                                                                 | 65/1751 [1:08:02<28:48:52, 61.53s/it]                                                                                                                                {'loss': '0.6226', 'grad_norm': '0.3477', 'learning_rate': '1.471e-05', 'ppl': '1.864', 'memory/max_active (GiB)': '77.28', 'memory/max_allocated (GiB)': '77.28', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '91.98', 'tokens/total': 124315864, 'tokens/trainable': 46040464, 'epoch': '0.1114'}
  4%|███                                                                                 | 65/1751 [1:08:02<28:48:52, 61.53s/it]  4%|███▏                                                                                | 66/1751 [1:09:03<28:40:18, 61.26s/it]                                                                                                                                {'loss': '0.621', 'grad_norm': '0.3262', 'learning_rate': '1.494e-05', 'ppl': '1.861', 'memory/max_active (GiB)': '76.53', 'memory/max_allocated (GiB)': '76.53', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '123.9', 'tokens/total': 126172712, 'tokens/trainable': 46723000, 'epoch': '0.1131'}
  4%|███▏                                                                                | 66/1751 [1:09:03<28:40:18, 61.26s/it]  4%|███▏                                                                                | 67/1751 [1:10:03<28:33:47, 61.06s/it]                                                                                                                                {'loss': '0.6208', 'grad_norm': '0.2969', 'learning_rate': '1.517e-05', 'ppl': '1.86', 'memory/max_active (GiB)': '72.11', 'memory/max_allocated (GiB)': '72.11', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '93.83', 'tokens/total': 128079680, 'tokens/trainable': 47423596, 'epoch': '0.1148'}
  4%|███▏                                                                                | 67/1751 [1:10:03<28:33:47, 61.06s/it]  4%|███▎                                                                                | 68/1751 [1:11:05<28:40:51, 61.35s/it]                                                                                                                                {'loss': '0.6117', 'grad_norm': '0.3027', 'learning_rate': '1.54e-05', 'ppl': '1.843', 'memory/max_active (GiB)': '75.33', 'memory/max_allocated (GiB)': '75.33', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '51.72', 'tokens/total': 130039968, 'tokens/trainable': 48177232, 'epoch': '0.1165'}
  4%|███▎                                                                                | 68/1751 [1:11:05<28:40:51, 61.35s/it]  4%|███▎                                                                                | 69/1751 [1:12:07<28:45:48, 61.56s/it]                                                                                                                                {'loss': '0.6166', 'grad_norm': '0.2949', 'learning_rate': '1.563e-05', 'ppl': '1.853', 'memory/max_active (GiB)': '76.76', 'memory/max_allocated (GiB)': '76.76', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '58.99', 'tokens/total': 131944448, 'tokens/trainable': 48879216, 'epoch': '0.1182'}
  4%|███▎                                                                                | 69/1751 [1:12:07<28:45:48, 61.56s/it]  4%|███▎                                                                                | 70/1751 [1:13:08<28:33:29, 61.16s/it]                                                                                                                                {'loss': '0.5929', 'grad_norm': '0.3047', 'learning_rate': '1.586e-05', 'ppl': '1.809', 'memory/max_active (GiB)': '74.79', 'memory/max_allocated (GiB)': '74.79', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '152.7', 'tokens/total': 133806208, 'tokens/trainable': 49559024, 'epoch': '0.12'}
  4%|███▎                                                                                | 70/1751 [1:13:08<28:33:29, 61.16s/it]  4%|███▍                                                                                | 71/1751 [1:14:11<28:47:45, 61.71s/it]                                                                                                                                {'loss': '0.5931', 'grad_norm': '0.2715', 'learning_rate': '1.609e-05', 'ppl': '1.81', 'memory/max_active (GiB)': '73.73', 'memory/max_allocated (GiB)': '73.73', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '83.94', 'tokens/total': 135741968, 'tokens/trainable': 50280100, 'epoch': '0.1217'}
  4%|███▍                                                                                | 71/1751 [1:14:11<28:47:45, 61.71s/it]  4%|███▍                                                                                | 72/1751 [1:15:11<28:38:26, 61.41s/it]                                                                                                                                {'loss': '0.611', 'grad_norm': '0.3066', 'learning_rate': '1.632e-05', 'ppl': '1.842', 'memory/max_active (GiB)': '72.59', 'memory/max_allocated (GiB)': '72.59', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '87.5', 'tokens/total': 137673456, 'tokens/trainable': 51000720, 'epoch': '0.1234'}
  4%|███▍                                                                                | 72/1751 [1:15:11<28:38:26, 61.41s/it]  4%|███▌                                                                                | 73/1751 [1:16:14<28:46:01, 61.72s/it]                                                                                                                                {'loss': '0.5611', 'grad_norm': '0.2754', 'learning_rate': '1.655e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '76.93', 'memory/max_allocated (GiB)': '76.93', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '79.71', 'tokens/total': 139674096, 'tokens/trainable': 51726184, 'epoch': '0.1251'}
  4%|███▌                                                                                | 73/1751 [1:16:14<28:46:01, 61.72s/it]  4%|███▌                                                                                | 74/1751 [1:17:17<28:59:00, 62.22s/it]                                                                                                                                {'loss': '0.589', 'grad_norm': '0.252', 'learning_rate': '1.678e-05', 'ppl': '1.802', 'memory/max_active (GiB)': '72.05', 'memory/max_allocated (GiB)': '72.05', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '164.2', 'tokens/total': 141722400, 'tokens/trainable': 52454600, 'epoch': '0.1268'}
  4%|███▌                                                                                | 74/1751 [1:17:17<28:59:00, 62.22s/it]  4%|███▌                                                                                | 75/1751 [1:18:21<29:09:15, 62.62s/it]                                                                                                                                {'loss': '0.5722', 'grad_norm': '0.3125', 'learning_rate': '1.701e-05', 'ppl': '1.772', 'memory/max_active (GiB)': '77.5', 'memory/max_allocated (GiB)': '77.5', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '91.79', 'tokens/total': 143743792, 'tokens/trainable': 53200752, 'epoch': '0.1285'}
  4%|███▌                                                                                | 75/1751 [1:18:21<29:09:15, 62.62s/it]  4%|███▋                                                                                | 76/1751 [1:19:21<28:44:30, 61.77s/it]                                                                                                                                {'loss': '0.605', 'grad_norm': '0.2949', 'learning_rate': '1.724e-05', 'ppl': '1.831', 'memory/max_active (GiB)': '73.67', 'memory/max_allocated (GiB)': '73.67', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '54.13', 'tokens/total': 145618064, 'tokens/trainable': 53874048, 'epoch': '0.1302'}
  4%|███▋                                                                                | 76/1751 [1:19:21<28:44:30, 61.77s/it]  4%|███▋                                                                                | 77/1751 [1:20:21<28:29:35, 61.28s/it]                                                                                                                                {'loss': '0.5926', 'grad_norm': '0.3047', 'learning_rate': '1.747e-05', 'ppl': '1.809', 'memory/max_active (GiB)': '70.07', 'memory/max_allocated (GiB)': '70.07', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '82.3', 'tokens/total': 147514720, 'tokens/trainable': 54567976, 'epoch': '0.1319'}
  4%|███▋                                                                                | 77/1751 [1:20:21<28:29:35, 61.28s/it]  4%|███▋                                                                                | 78/1751 [1:21:18<27:58:42, 60.20s/it]                                                                                                                                {'loss': '0.5939', 'grad_norm': '0.2695', 'learning_rate': '1.77e-05', 'ppl': '1.811', 'memory/max_active (GiB)': '75.16', 'memory/max_allocated (GiB)': '75.16', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '185.5', 'tokens/total': 149302848, 'tokens/trainable': 55226360, 'epoch': '0.1337'}
  4%|███▋                                                                                | 78/1751 [1:21:18<27:58:42, 60.20s/it]  5%|███▊                                                                                | 79/1751 [1:22:20<28:11:59, 60.72s/it]                                                                                                                                {'loss': '0.569', 'grad_norm': '0.2656', 'learning_rate': '1.793e-05', 'ppl': '1.767', 'memory/max_active (GiB)': '76.62', 'memory/max_allocated (GiB)': '76.62', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 151271536, 'tokens/trainable': 55955340, 'epoch': '0.1354'}
  5%|███▊                                                                                | 79/1751 [1:22:20<28:11:59, 60.72s/it]  5%|███▊                                                                                | 80/1751 [1:23:19<27:56:25, 60.20s/it]                                                                                                                                {'loss': '0.5873', 'grad_norm': '0.2578', 'learning_rate': '1.816e-05', 'ppl': '1.799', 'memory/max_active (GiB)': '72.79', 'memory/max_allocated (GiB)': '72.79', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '99.47', 'tokens/total': 153079920, 'tokens/trainable': 56620704, 'epoch': '0.1371'}
  5%|███▊                                                                                | 80/1751 [1:23:19<27:56:25, 60.20s/it]  5%|███▉                                                                                | 81/1751 [1:24:20<27:57:43, 60.28s/it]                                                                                                                                {'loss': '0.6041', 'grad_norm': '0.249', 'learning_rate': '1.839e-05', 'ppl': '1.83', 'memory/max_active (GiB)': '73.78', 'memory/max_allocated (GiB)': '73.78', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '41.01', 'tokens/total': 154979520, 'tokens/trainable': 57324184, 'epoch': '0.1388'}
  5%|███▉                                                                                | 81/1751 [1:24:20<27:57:43, 60.28s/it]  5%|███▉                                                                                | 82/1751 [1:25:20<27:59:35, 60.38s/it]                                                                                                                                {'loss': '0.593', 'grad_norm': '0.2471', 'learning_rate': '1.862e-05', 'ppl': '1.809', 'memory/max_active (GiB)': '76.63', 'memory/max_allocated (GiB)': '76.63', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '45.5', 'tokens/total': 156862768, 'tokens/trainable': 58022332, 'epoch': '0.1405'}
  5%|███▉                                                                                | 82/1751 [1:25:20<27:59:35, 60.38s/it]  5%|███▉                                                                                | 83/1751 [1:26:20<27:51:22, 60.12s/it]                                                                                                                                {'loss': '0.6267', 'grad_norm': '0.2598', 'learning_rate': '1.885e-05', 'ppl': '1.871', 'memory/max_active (GiB)': '73.38', 'memory/max_allocated (GiB)': '73.38', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '64.97', 'tokens/total': 158745120, 'tokens/trainable': 58701116, 'epoch': '0.1422'}
  5%|███▉                                                                                | 83/1751 [1:26:20<27:51:22, 60.12s/it]  5%|████                                                                                | 84/1751 [1:27:22<28:08:42, 60.78s/it]                                                                                                                                {'loss': '0.5868', 'grad_norm': '0.2373', 'learning_rate': '1.908e-05', 'ppl': '1.798', 'memory/max_active (GiB)': '77.04', 'memory/max_allocated (GiB)': '77.04', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '26.79', 'tokens/total': 160753152, 'tokens/trainable': 59453312, 'epoch': '0.1439'}
  5%|████                                                                                | 84/1751 [1:27:22<28:08:42, 60.78s/it]  5%|████                                                                                | 85/1751 [1:28:22<27:59:06, 60.47s/it]                                                                                                                                {'loss': '0.6139', 'grad_norm': '0.2812', 'learning_rate': '1.931e-05', 'ppl': '1.848', 'memory/max_active (GiB)': '75.68', 'memory/max_allocated (GiB)': '75.68', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '39.91', 'tokens/total': 162620640, 'tokens/trainable': 60155104, 'epoch': '0.1457'}
  5%|████                                                                                | 85/1751 [1:28:22<27:59:06, 60.47s/it]  5%|████▏                                                                               | 86/1751 [1:29:22<27:50:57, 60.21s/it]                                                                                                                                {'loss': '0.5852', 'grad_norm': '0.2773', 'learning_rate': '1.954e-05', 'ppl': '1.795', 'memory/max_active (GiB)': '73.79', 'memory/max_allocated (GiB)': '73.79', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '100', 'tokens/total': 164510736, 'tokens/trainable': 60849736, 'epoch': '0.1474'}
  5%|████▏                                                                               | 86/1751 [1:29:22<27:50:57, 60.21s/it]  5%|████▏                                                                               | 87/1751 [1:30:21<27:39:50, 59.85s/it]                                                                                                                                {'loss': '0.6158', 'grad_norm': '0.2656', 'learning_rate': '1.977e-05', 'ppl': '1.851', 'memory/max_active (GiB)': '74.43', 'memory/max_allocated (GiB)': '74.43', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '111.1', 'tokens/total': 166356144, 'tokens/trainable': 61525336, 'epoch': '0.1491'}
  5%|████▏                                                                               | 87/1751 [1:30:21<27:39:50, 59.85s/it]  5%|████▏                                                                               | 88/1751 [1:31:23<27:57:28, 60.52s/it]                                                                                                                                {'loss': '0.5928', 'grad_norm': '0.249', 'learning_rate': '2e-05', 'ppl': '1.809', 'memory/max_active (GiB)': '74.94', 'memory/max_allocated (GiB)': '74.94', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '115.9', 'tokens/total': 168322688, 'tokens/trainable': 62221964, 'epoch': '0.1508'}
  5%|████▏                                                                               | 88/1751 [1:31:23<27:57:28, 60.52s/it]  5%|████▎                                                                               | 89/1751 [1:32:25<28:10:51, 61.04s/it]                                                                                                                                {'loss': '0.5918', 'grad_norm': '0.25', 'learning_rate': '2e-05', 'ppl': '1.807', 'memory/max_active (GiB)': '73.91', 'memory/max_allocated (GiB)': '73.91', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '77.34', 'tokens/total': 170283696, 'tokens/trainable': 62970592, 'epoch': '0.1525'}
  5%|████▎                                                                               | 89/1751 [1:32:25<28:10:51, 61.04s/it]  5%|████▎                                                                               | 90/1751 [1:33:25<28:02:52, 60.79s/it]                                                                                                                                {'loss': '0.6049', 'grad_norm': '0.252', 'learning_rate': '2e-05', 'ppl': '1.831', 'memory/max_active (GiB)': '71.18', 'memory/max_allocated (GiB)': '71.18', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '50.74', 'tokens/total': 172188336, 'tokens/trainable': 63680596, 'epoch': '0.1542'}
  5%|████▎                                                                               | 90/1751 [1:33:25<28:02:52, 60.79s/it]  5%|████▎                                                                               | 91/1751 [1:34:26<28:02:21, 60.81s/it]                                                                                                                                {'loss': '0.5748', 'grad_norm': '0.2324', 'learning_rate': '2e-05', 'ppl': '1.777', 'memory/max_active (GiB)': '70.01', 'memory/max_allocated (GiB)': '70.01', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '42.82', 'tokens/total': 174122560, 'tokens/trainable': 64398316, 'epoch': '0.1559'}
  5%|████▎                                                                               | 91/1751 [1:34:26<28:02:21, 60.81s/it]  5%|████▍                                                                               | 92/1751 [1:35:26<27:52:16, 60.48s/it]                                                                                                                                {'loss': '0.6215', 'grad_norm': '0.2715', 'learning_rate': '2e-05', 'ppl': '1.862', 'memory/max_active (GiB)': '70.64', 'memory/max_allocated (GiB)': '70.64', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '69.01', 'tokens/total': 175974256, 'tokens/trainable': 65062496, 'epoch': '0.1577'}
  5%|████▍                                                                               | 92/1751 [1:35:26<27:52:16, 60.48s/it]  5%|████▍                                                                               | 93/1751 [1:36:27<27:58:12, 60.73s/it]                                                                                                                                {'loss': '0.5591', 'grad_norm': '0.2275', 'learning_rate': '2e-05', 'ppl': '1.749', 'memory/max_active (GiB)': '76.74', 'memory/max_allocated (GiB)': '76.74', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '73.12', 'tokens/total': 177896384, 'tokens/trainable': 65762784, 'epoch': '0.1594'}
  5%|████▍                                                                               | 93/1751 [1:36:27<27:58:12, 60.73s/it]  5%|████▌                                                                               | 94/1751 [1:37:28<28:00:31, 60.85s/it]                                                                                                                                {'loss': '0.5763', 'grad_norm': '0.2383', 'learning_rate': '2e-05', 'ppl': '1.779', 'memory/max_active (GiB)': '76.05', 'memory/max_allocated (GiB)': '76.05', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '55.35', 'tokens/total': 179817136, 'tokens/trainable': 66476156, 'epoch': '0.1611'}
  5%|████▌                                                                               | 94/1751 [1:37:28<28:00:31, 60.85s/it]  5%|████▌                                                                               | 95/1751 [1:38:29<27:57:41, 60.79s/it]                                                                                                                                {'loss': '0.5836', 'grad_norm': '0.2559', 'learning_rate': '2e-05', 'ppl': '1.793', 'memory/max_active (GiB)': '70.4', 'memory/max_allocated (GiB)': '70.4', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '64.7', 'tokens/total': 181719856, 'tokens/trainable': 67168816, 'epoch': '0.1628'}
  5%|████▌                                                                               | 95/1751 [1:38:29<27:57:41, 60.79s/it]  5%|████▌                                                                               | 96/1751 [1:39:32<28:13:31, 61.40s/it]                                                                                                                                {'loss': '0.5518', 'grad_norm': '0.2168', 'learning_rate': '2e-05', 'ppl': '1.736', 'memory/max_active (GiB)': '74.75', 'memory/max_allocated (GiB)': '74.75', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '38.96', 'tokens/total': 183740240, 'tokens/trainable': 67938688, 'epoch': '0.1645'}
  5%|████▌                                                                               | 96/1751 [1:39:32<28:13:31, 61.40s/it]  6%|████▋                                                                               | 97/1751 [1:40:32<28:04:33, 61.11s/it]                                                                                                                                {'loss': '0.5583', 'grad_norm': '0.2334', 'learning_rate': '2e-05', 'ppl': '1.748', 'memory/max_active (GiB)': '74.5', 'memory/max_allocated (GiB)': '74.5', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '59.12', 'tokens/total': 185665296, 'tokens/trainable': 68661048, 'epoch': '0.1662'}
  6%|████▋                                                                               | 97/1751 [1:40:32<28:04:33, 61.11s/it]  6%|████▋                                                                               | 98/1751 [1:41:32<27:52:44, 60.72s/it]                                                                                                                                {'loss': '0.6144', 'grad_norm': '0.252', 'learning_rate': '2e-05', 'ppl': '1.849', 'memory/max_active (GiB)': '74.97', 'memory/max_allocated (GiB)': '74.97', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '39.16', 'tokens/total': 187548256, 'tokens/trainable': 69304800, 'epoch': '0.1679'}
  6%|████▋                                                                               | 98/1751 [1:41:32<27:52:44, 60.72s/it]  6%|████▋                                                                               | 99/1751 [1:42:30<27:29:13, 59.90s/it]                                                                                                                                {'loss': '0.595', 'grad_norm': '0.2383', 'learning_rate': '2e-05', 'ppl': '1.813', 'memory/max_active (GiB)': '74.15', 'memory/max_allocated (GiB)': '74.15', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '128.3', 'tokens/total': 189332880, 'tokens/trainable': 69971080, 'epoch': '0.1696'}
  6%|████▋                                                                               | 99/1751 [1:42:30<27:29:13, 59.90s/it]  6%|████▋                                                                              | 100/1751 [1:43:28<27:18:00, 59.53s/it]                                                                                                                                {'loss': '0.5954', 'grad_norm': '0.2363', 'learning_rate': '2e-05', 'ppl': '1.814', 'memory/max_active (GiB)': '73.87', 'memory/max_allocated (GiB)': '73.87', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '60.22', 'tokens/total': 191109776, 'tokens/trainable': 70635712, 'epoch': '0.1714'}
  6%|████▋                                                                              | 100/1751 [1:43:28<27:18:00, 59.53s/it]  6%|████▊                                                                              | 101/1751 [1:44:31<27:41:23, 60.41s/it]                                                                                                                                {'loss': '0.5439', 'grad_norm': '0.2168', 'learning_rate': '2e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '75.77', 'memory/max_allocated (GiB)': '75.77', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '100.2', 'tokens/total': 193049408, 'tokens/trainable': 71395224, 'epoch': '0.1731'}
  6%|████▊                                                                              | 101/1751 [1:44:31<27:41:23, 60.41s/it]  6%|████▊                                                                              | 102/1751 [1:45:30<27:33:11, 60.15s/it]                                                                                                                                {'loss': '0.5777', 'grad_norm': '0.2314', 'learning_rate': '2e-05', 'ppl': '1.782', 'memory/max_active (GiB)': '71.39', 'memory/max_allocated (GiB)': '71.39', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '168.9', 'tokens/total': 194919184, 'tokens/trainable': 72083656, 'epoch': '0.1748'}
  6%|████▊                                                                              | 102/1751 [1:45:30<27:33:11, 60.15s/it]  6%|████▉                                                                              | 103/1751 [1:46:28<27:13:18, 59.47s/it]                                                                                                                                {'loss': '0.5905', 'grad_norm': '0.2539', 'learning_rate': '2e-05', 'ppl': '1.805', 'memory/max_active (GiB)': '70.2', 'memory/max_allocated (GiB)': '70.2', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '117.1', 'tokens/total': 196726416, 'tokens/trainable': 72720400, 'epoch': '0.1765'}
  6%|████▉                                                                              | 103/1751 [1:46:28<27:13:18, 59.47s/it]  6%|████▉                                                                              | 104/1751 [1:47:30<27:28:00, 60.04s/it]                                                                                                                                {'loss': '0.5696', 'grad_norm': '0.2129', 'learning_rate': '2e-05', 'ppl': '1.768', 'memory/max_active (GiB)': '70.42', 'memory/max_allocated (GiB)': '70.42', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '50.8', 'tokens/total': 198692608, 'tokens/trainable': 73418912, 'epoch': '0.1782'}
  6%|████▉                                                                              | 104/1751 [1:47:30<27:28:00, 60.04s/it]  6%|████▉                                                                              | 105/1751 [1:48:31<27:40:27, 60.53s/it]                                                                                                                                {'loss': '0.5733', 'grad_norm': '0.2656', 'learning_rate': '1.999e-05', 'ppl': '1.774', 'memory/max_active (GiB)': '75.22', 'memory/max_allocated (GiB)': '75.22', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '78.49', 'tokens/total': 200688016, 'tokens/trainable': 74133688, 'epoch': '0.1799'}
  6%|████▉                                                                              | 105/1751 [1:48:31<27:40:27, 60.53s/it]  6%|█████                                                                              | 106/1751 [1:49:29<27:14:12, 59.61s/it]                                                                                                                                {'loss': '0.6144', 'grad_norm': '0.2617', 'learning_rate': '1.999e-05', 'ppl': '1.849', 'memory/max_active (GiB)': '73.8', 'memory/max_allocated (GiB)': '73.8', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '89.94', 'tokens/total': 202498784, 'tokens/trainable': 74763936, 'epoch': '0.1816'}
  6%|█████                                                                              | 106/1751 [1:49:29<27:14:12, 59.61s/it]  6%|█████                                                                              | 107/1751 [1:50:27<27:03:17, 59.24s/it]                                                                                                                                {'loss': '0.5939', 'grad_norm': '0.2363', 'learning_rate': '1.999e-05', 'ppl': '1.811', 'memory/max_active (GiB)': '68.58', 'memory/max_allocated (GiB)': '68.58', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '67.52', 'tokens/total': 204317552, 'tokens/trainable': 75448560, 'epoch': '0.1834'}
  6%|█████                                                                              | 107/1751 [1:50:27<27:03:17, 59.24s/it]  6%|█████                                                                              | 108/1751 [1:51:26<26:55:00, 58.98s/it]                                                                                                                                {'loss': '0.6083', 'grad_norm': '0.2422', 'learning_rate': '1.999e-05', 'ppl': '1.837', 'memory/max_active (GiB)': '75.65', 'memory/max_allocated (GiB)': '75.65', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '50.35', 'tokens/total': 206121152, 'tokens/trainable': 76127592, 'epoch': '0.1851'}
  6%|█████                                                                              | 108/1751 [1:51:26<26:55:00, 58.98s/it]  6%|█████▏                                                                             | 109/1751 [1:52:26<27:03:37, 59.33s/it]                                                                                                                                {'loss': '0.5978', 'grad_norm': '0.2412', 'learning_rate': '1.999e-05', 'ppl': '1.818', 'memory/max_active (GiB)': '74.48', 'memory/max_allocated (GiB)': '74.48', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '42.51', 'tokens/total': 208044288, 'tokens/trainable': 76848960, 'epoch': '0.1868'}
  6%|█████▏                                                                             | 109/1751 [1:52:26<27:03:37, 59.33s/it]  6%|█████▏                                                                             | 110/1751 [1:53:26<27:13:56, 59.74s/it]                                                                                                                                {'loss': '0.56', 'grad_norm': '0.2354', 'learning_rate': '1.999e-05', 'ppl': '1.751', 'memory/max_active (GiB)': '77.1', 'memory/max_allocated (GiB)': '77.1', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '52.87', 'tokens/total': 209968336, 'tokens/trainable': 77552864, 'epoch': '0.1885'}
  6%|█████▏                                                                             | 110/1751 [1:53:26<27:13:56, 59.74s/it]  6%|█████▎                                                                             | 111/1751 [1:54:28<27:28:07, 60.30s/it]                                                                                                                                {'loss': '0.597', 'grad_norm': '0.2188', 'learning_rate': '1.999e-05', 'ppl': '1.817', 'memory/max_active (GiB)': '75.72', 'memory/max_allocated (GiB)': '75.72', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '22.08', 'tokens/total': 211980640, 'tokens/trainable': 78268088, 'epoch': '0.1902'}
  6%|█████▎                                                                             | 111/1751 [1:54:28<27:28:07, 60.30s/it]  6%|█████▎                                                                             | 112/1751 [1:55:30<27:42:41, 60.87s/it]                                                                                                                                {'loss': '0.5725', 'grad_norm': '0.2256', 'learning_rate': '1.999e-05', 'ppl': '1.773', 'memory/max_active (GiB)': '76.17', 'memory/max_allocated (GiB)': '76.17', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '38.47', 'tokens/total': 213949632, 'tokens/trainable': 79010344, 'epoch': '0.1919'}
  6%|█████▎                                                                             | 112/1751 [1:55:30<27:42:41, 60.87s/it]  6%|█████▎                                                                             | 113/1751 [1:56:32<27:49:33, 61.16s/it]                                                                                                                                {'loss': '0.5702', 'grad_norm': '0.2383', 'learning_rate': '1.999e-05', 'ppl': '1.769', 'memory/max_active (GiB)': '69.53', 'memory/max_allocated (GiB)': '69.53', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '117.2', 'tokens/total': 215872464, 'tokens/trainable': 79738056, 'epoch': '0.1936'}
  6%|█████▎                                                                             | 113/1751 [1:56:32<27:49:33, 61.16s/it]  7%|█████▍                                                                             | 114/1751 [1:57:30<27:23:19, 60.23s/it]                                                                                                                                {'loss': '0.5927', 'grad_norm': '0.2422', 'learning_rate': '1.999e-05', 'ppl': '1.809', 'memory/max_active (GiB)': '74.13', 'memory/max_allocated (GiB)': '74.13', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '105.2', 'tokens/total': 217673504, 'tokens/trainable': 80400000, 'epoch': '0.1953'}
  7%|█████▍                                                                             | 114/1751 [1:57:30<27:23:19, 60.23s/it]  7%|█████▍                                                                             | 115/1751 [1:58:29<27:15:13, 59.97s/it]                                                                                                                                {'loss': '0.5812', 'grad_norm': '0.2393', 'learning_rate': '1.999e-05', 'ppl': '1.788', 'memory/max_active (GiB)': '74.35', 'memory/max_allocated (GiB)': '74.35', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '23.87', 'tokens/total': 219593280, 'tokens/trainable': 81067152, 'epoch': '0.1971'}
  7%|█████▍                                                                             | 115/1751 [1:58:29<27:15:13, 59.97s/it]  7%|█████▍                                                                             | 116/1751 [1:59:29<27:14:16, 59.97s/it]                                                                                                                                {'loss': '0.6062', 'grad_norm': '0.2578', 'learning_rate': '1.999e-05', 'ppl': '1.833', 'memory/max_active (GiB)': '73.45', 'memory/max_allocated (GiB)': '73.45', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '51.99', 'tokens/total': 221512064, 'tokens/trainable': 81779680, 'epoch': '0.1988'}
  7%|█████▍                                                                             | 116/1751 [1:59:29<27:14:16, 59.97s/it]  7%|█████▌                                                                             | 117/1751 [2:00:28<27:02:31, 59.58s/it]                                                                                                                                {'loss': '0.5877', 'grad_norm': '0.2344', 'learning_rate': '1.999e-05', 'ppl': '1.8', 'memory/max_active (GiB)': '76.21', 'memory/max_allocated (GiB)': '76.21', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '70.28', 'tokens/total': 223276000, 'tokens/trainable': 82453760, 'epoch': '0.2005'}
  7%|█████▌                                                                             | 117/1751 [2:00:28<27:02:31, 59.58s/it]  7%|█████▌                                                                             | 118/1751 [2:01:27<26:58:15, 59.46s/it]                                                                                                                                {'loss': '0.5701', 'grad_norm': '0.2217', 'learning_rate': '1.998e-05', 'ppl': '1.768', 'memory/max_active (GiB)': '71.86', 'memory/max_allocated (GiB)': '71.86', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '97.41', 'tokens/total': 225097952, 'tokens/trainable': 83145312, 'epoch': '0.2022'}
  7%|█████▌                                                                             | 118/1751 [2:01:27<26:58:15, 59.46s/it]  7%|█████▋                                                                             | 119/1751 [2:02:28<27:03:30, 59.69s/it]                                                                                                                                {'loss': '0.5886', 'grad_norm': '0.2246', 'learning_rate': '1.998e-05', 'ppl': '1.801', 'memory/max_active (GiB)': '71.83', 'memory/max_allocated (GiB)': '71.83', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '41.28', 'tokens/total': 226962832, 'tokens/trainable': 83860240, 'epoch': '0.2039'}
  7%|█████▋                                                                             | 119/1751 [2:02:28<27:03:30, 59.69s/it]  7%|█████▋                                                                             | 120/1751 [2:03:29<27:19:17, 60.30s/it]                                                                                                                                {'loss': '0.5297', 'grad_norm': '0.2314', 'learning_rate': '1.998e-05', 'ppl': '1.698', 'memory/max_active (GiB)': '71', 'memory/max_allocated (GiB)': '71', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '52.85', 'tokens/total': 228923104, 'tokens/trainable': 84592320, 'epoch': '0.2056'}
  7%|█████▋                                                                             | 120/1751 [2:03:29<27:19:17, 60.30s/it]  7%|█████▋                                                                             | 121/1751 [2:04:28<27:07:24, 59.90s/it]                                                                                                                                {'loss': '0.5949', 'grad_norm': '0.2305', 'learning_rate': '1.998e-05', 'ppl': '1.813', 'memory/max_active (GiB)': '76.27', 'memory/max_allocated (GiB)': '76.27', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '12.88', 'tokens/total': 230746240, 'tokens/trainable': 85297672, 'epoch': '0.2073'}
  7%|█████▋                                                                             | 121/1751 [2:04:28<27:07:24, 59.90s/it]  7%|█████▊                                                                             | 122/1751 [2:05:28<27:02:42, 59.77s/it]                                                                                                                                {'loss': '0.6049', 'grad_norm': '0.2373', 'learning_rate': '1.998e-05', 'ppl': '1.831', 'memory/max_active (GiB)': '74.85', 'memory/max_allocated (GiB)': '74.85', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '126.9', 'tokens/total': 232607232, 'tokens/trainable': 85988848, 'epoch': '0.2091'}
  7%|█████▊                                                                             | 122/1751 [2:05:28<27:02:42, 59.77s/it]  7%|█████▊                                                                             | 123/1751 [2:06:29<27:10:37, 60.10s/it]                                                                                                                                {'loss': '0.5966', 'grad_norm': '0.2354', 'learning_rate': '1.998e-05', 'ppl': '1.816', 'memory/max_active (GiB)': '72.6', 'memory/max_allocated (GiB)': '72.6', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '57.68', 'tokens/total': 234524032, 'tokens/trainable': 86708608, 'epoch': '0.2108'}
  7%|█████▊                                                                             | 123/1751 [2:06:29<27:10:37, 60.10s/it]  7%|█████▉                                                                             | 124/1751 [2:07:28<27:06:31, 59.98s/it]                                                                                                                                {'loss': '0.5395', 'grad_norm': '0.2285', 'learning_rate': '1.998e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '76.83', 'memory/max_allocated (GiB)': '76.83', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '58.81', 'tokens/total': 236369616, 'tokens/trainable': 87364168, 'epoch': '0.2125'}
  7%|█████▉                                                                             | 124/1751 [2:07:28<27:06:31, 59.98s/it]  7%|█████▉                                                                             | 125/1751 [2:08:29<27:09:59, 60.15s/it]                                                                                                                                {'loss': '0.575', 'grad_norm': '0.2402', 'learning_rate': '1.998e-05', 'ppl': '1.777', 'memory/max_active (GiB)': '72.53', 'memory/max_allocated (GiB)': '72.53', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '52.97', 'tokens/total': 238242352, 'tokens/trainable': 88064928, 'epoch': '0.2142'}
  7%|█████▉                                                                             | 125/1751 [2:08:29<27:09:59, 60.15s/it]  7%|█████▉                                                                             | 126/1751 [2:09:29<27:10:58, 60.22s/it]                                                                                                                                {'loss': '0.6135', 'grad_norm': '0.25', 'learning_rate': '1.997e-05', 'ppl': '1.847', 'memory/max_active (GiB)': '74.38', 'memory/max_allocated (GiB)': '74.38', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '70.92', 'tokens/total': 240190784, 'tokens/trainable': 88776408, 'epoch': '0.2159'}
  7%|█████▉                                                                             | 126/1751 [2:09:29<27:10:58, 60.22s/it]  7%|██████                                                                             | 127/1751 [2:10:25<26:36:30, 58.98s/it]                                                                                                                                {'loss': '0.5863', 'grad_norm': '0.25', 'learning_rate': '1.997e-05', 'ppl': '1.797', 'memory/max_active (GiB)': '72.47', 'memory/max_allocated (GiB)': '72.47', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '36.6', 'tokens/total': 241944592, 'tokens/trainable': 89401328, 'epoch': '0.2176'}
  7%|██████                                                                             | 127/1751 [2:10:25<26:36:30, 58.98s/it]  7%|██████                                                                             | 128/1751 [2:11:27<26:54:47, 59.70s/it]                                                                                                                                {'loss': '0.5624', 'grad_norm': '0.2295', 'learning_rate': '1.997e-05', 'ppl': '1.755', 'memory/max_active (GiB)': '76.25', 'memory/max_allocated (GiB)': '76.25', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '110.3', 'tokens/total': 243889200, 'tokens/trainable': 90122768, 'epoch': '0.2193'}
  7%|██████                                                                             | 128/1751 [2:11:27<26:54:47, 59.70s/it]  7%|██████                                                                             | 129/1751 [2:12:28<27:09:53, 60.29s/it]                                                                                                                                {'loss': '0.5838', 'grad_norm': '0.252', 'learning_rate': '1.997e-05', 'ppl': '1.793', 'memory/max_active (GiB)': '77.36', 'memory/max_allocated (GiB)': '77.36', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '123.6', 'tokens/total': 245841840, 'tokens/trainable': 90876552, 'epoch': '0.2211'}
  7%|██████                                                                             | 129/1751 [2:12:28<27:09:53, 60.29s/it]  7%|██████▏                                                                            | 130/1751 [2:13:28<27:01:45, 60.03s/it]                                                                                                                                {'loss': '0.5771', 'grad_norm': '0.2246', 'learning_rate': '1.997e-05', 'ppl': '1.781', 'memory/max_active (GiB)': '74.15', 'memory/max_allocated (GiB)': '74.15', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '116.7', 'tokens/total': 247707200, 'tokens/trainable': 91565664, 'epoch': '0.2228'}
  7%|██████▏                                                                            | 130/1751 [2:13:28<27:01:45, 60.03s/it]  7%|██████▏                                                                            | 131/1751 [2:14:29<27:08:53, 60.33s/it]                                                                                                                                {'loss': '0.5613', 'grad_norm': '0.2109', 'learning_rate': '1.997e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '74.33', 'memory/max_allocated (GiB)': '74.33', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '199.4', 'tokens/total': 249636928, 'tokens/trainable': 92274288, 'epoch': '0.2245'}
  7%|██████▏                                                                            | 131/1751 [2:14:29<27:08:53, 60.33s/it]  8%|██████▎                                                                            | 132/1751 [2:15:28<26:59:46, 60.03s/it]                                                                                                                                {'loss': '0.6027', 'grad_norm': '0.248', 'learning_rate': '1.997e-05', 'ppl': '1.827', 'memory/max_active (GiB)': '74.1', 'memory/max_allocated (GiB)': '74.1', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '48.83', 'tokens/total': 251537888, 'tokens/trainable': 92979968, 'epoch': '0.2262'}
  8%|██████▎                                                                            | 132/1751 [2:15:28<26:59:46, 60.03s/it]  8%|██████▎                                                                            | 133/1751 [2:16:31<27:19:31, 60.80s/it]                                                                                                                                {'loss': '0.5267', 'grad_norm': '0.2119', 'learning_rate': '1.996e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '76.53', 'memory/max_allocated (GiB)': '76.53', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '103.9', 'tokens/total': 253561216, 'tokens/trainable': 93711520, 'epoch': '0.2279'}
  8%|██████▎                                                                            | 133/1751 [2:16:31<27:19:31, 60.80s/it]  8%|██████▎                                                                            | 134/1751 [2:17:28<26:54:07, 59.89s/it]                                                                                                                                {'loss': '0.5706', 'grad_norm': '0.2188', 'learning_rate': '1.996e-05', 'ppl': '1.769', 'memory/max_active (GiB)': '70.26', 'memory/max_allocated (GiB)': '70.26', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '110.6', 'tokens/total': 255386304, 'tokens/trainable': 94384464, 'epoch': '0.2296'}
  8%|██████▎                                                                            | 134/1751 [2:17:28<26:54:07, 59.89s/it]  8%|██████▍                                                                            | 135/1751 [2:18:27<26:42:23, 59.49s/it]                                                                                                                                {'loss': '0.6169', 'grad_norm': '0.249', 'learning_rate': '1.996e-05', 'ppl': '1.853', 'memory/max_active (GiB)': '77.08', 'memory/max_allocated (GiB)': '77.08', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '112.7', 'tokens/total': 257186016, 'tokens/trainable': 95019600, 'epoch': '0.2313'}
  8%|██████▍                                                                            | 135/1751 [2:18:27<26:42:23, 59.49s/it]  8%|██████▍                                                                            | 136/1751 [2:19:27<26:45:45, 59.66s/it]                                                                                                                                {'loss': '0.5609', 'grad_norm': '0.2158', 'learning_rate': '1.996e-05', 'ppl': '1.752', 'memory/max_active (GiB)': '75.9', 'memory/max_allocated (GiB)': '75.9', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '185.9', 'tokens/total': 259062032, 'tokens/trainable': 95745400, 'epoch': '0.233'}
  8%|██████▍                                                                            | 136/1751 [2:19:27<26:45:45, 59.66s/it]  8%|██████▍                                                                            | 137/1751 [2:20:29<26:59:22, 60.20s/it]                                                                                                                                {'loss': '0.5505', 'grad_norm': '0.2305', 'learning_rate': '1.996e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '76.38', 'memory/max_allocated (GiB)': '76.38', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '43.52', 'tokens/total': 260987744, 'tokens/trainable': 96483784, 'epoch': '0.2348'}
  8%|██████▍                                                                            | 137/1751 [2:20:29<26:59:22, 60.20s/it]  8%|██████▌                                                                            | 138/1751 [2:21:30<27:07:42, 60.55s/it]                                                                                                                                {'loss': '0.5502', 'grad_norm': '0.2061', 'learning_rate': '1.996e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '71.74', 'memory/max_allocated (GiB)': '71.74', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '111.6', 'tokens/total': 262961856, 'tokens/trainable': 97209512, 'epoch': '0.2365'}
  8%|██████▌                                                                            | 138/1751 [2:21:30<27:07:42, 60.55s/it]  8%|██████▌                                                                            | 139/1751 [2:22:29<26:53:31, 60.06s/it]                                                                                                                                {'loss': '0.5431', 'grad_norm': '0.2168', 'learning_rate': '1.995e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '74.15', 'memory/max_allocated (GiB)': '74.15', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '37.15', 'tokens/total': 264834096, 'tokens/trainable': 97901720, 'epoch': '0.2382'}
  8%|██████▌                                                                            | 139/1751 [2:22:29<26:53:31, 60.06s/it]  8%|██████▋                                                                            | 140/1751 [2:23:30<27:05:35, 60.54s/it]                                                                                                                                {'loss': '0.5664', 'grad_norm': '0.2178', 'learning_rate': '1.995e-05', 'ppl': '1.762', 'memory/max_active (GiB)': '69.49', 'memory/max_allocated (GiB)': '69.49', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '82.14', 'tokens/total': 266805520, 'tokens/trainable': 98615056, 'epoch': '0.2399'}
  8%|██████▋                                                                            | 140/1751 [2:23:30<27:05:35, 60.54s/it]  8%|██████▋                                                                            | 141/1751 [2:24:30<26:54:54, 60.18s/it]                                                                                                                                {'loss': '0.5795', 'grad_norm': '0.2256', 'learning_rate': '1.995e-05', 'ppl': '1.785', 'memory/max_active (GiB)': '72.54', 'memory/max_allocated (GiB)': '72.54', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '122.6', 'tokens/total': 268601600, 'tokens/trainable': 99285352, 'epoch': '0.2416'}
  8%|██████▋                                                                            | 141/1751 [2:24:30<26:54:54, 60.18s/it]  8%|██████▋                                                                            | 142/1751 [2:25:30<26:56:11, 60.27s/it]                                                                                                                                {'loss': '0.5636', 'grad_norm': '0.2119', 'learning_rate': '1.995e-05', 'ppl': '1.757', 'memory/max_active (GiB)': '75.83', 'memory/max_allocated (GiB)': '75.83', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '61.5', 'tokens/total': 270473184, 'tokens/trainable': 99955200, 'epoch': '0.2433'}
  8%|██████▋                                                                            | 142/1751 [2:25:30<26:56:11, 60.27s/it]  8%|██████▊                                                                            | 143/1751 [2:26:31<26:56:43, 60.33s/it]                                                                                                                                {'loss': '0.5666', 'grad_norm': '0.2412', 'learning_rate': '1.995e-05', 'ppl': '1.762', 'memory/max_active (GiB)': '71.85', 'memory/max_allocated (GiB)': '71.85', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '116.7', 'tokens/total': 272376384, 'tokens/trainable': 100665072, 'epoch': '0.245'}
  8%|██████▊                                                                            | 143/1751 [2:26:31<26:56:43, 60.33s/it]  8%|██████▊                                                                            | 144/1751 [2:27:30<26:45:45, 59.95s/it]                                                                                                                                {'loss': '0.5589', 'grad_norm': '0.2412', 'learning_rate': '1.994e-05', 'ppl': '1.749', 'memory/max_active (GiB)': '70.73', 'memory/max_allocated (GiB)': '70.73', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '98.22', 'tokens/total': 274219136, 'tokens/trainable': 101342640, 'epoch': '0.2468'}
  8%|██████▊                                                                            | 144/1751 [2:27:30<26:45:45, 59.95s/it]  8%|██████▊                                                                            | 145/1751 [2:28:29<26:36:05, 59.63s/it]                                                                                                                                {'loss': '0.5533', 'grad_norm': '0.2119', 'learning_rate': '1.994e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '69.52', 'memory/max_allocated (GiB)': '69.52', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '21.88', 'tokens/total': 276089920, 'tokens/trainable': 102030128, 'epoch': '0.2485'}
  8%|██████▊                                                                            | 145/1751 [2:28:29<26:36:05, 59.63s/it]  8%|██████▉                                                                            | 146/1751 [2:29:26<26:16:52, 58.95s/it]                                                                                                                                {'loss': '0.599', 'grad_norm': '0.2559', 'learning_rate': '1.994e-05', 'ppl': '1.82', 'memory/max_active (GiB)': '69.28', 'memory/max_allocated (GiB)': '69.28', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '57.23', 'tokens/total': 277878208, 'tokens/trainable': 102699888, 'epoch': '0.2502'}
  8%|██████▉                                                                            | 146/1751 [2:29:26<26:16:52, 58.95s/it]  8%|██████▉                                                                            | 147/1751 [2:30:23<25:56:33, 58.23s/it]                                                                                                                                {'loss': '0.5954', 'grad_norm': '0.2285', 'learning_rate': '1.994e-05', 'ppl': '1.814', 'memory/max_active (GiB)': '73.86', 'memory/max_allocated (GiB)': '73.86', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '188.7', 'tokens/total': 279594464, 'tokens/trainable': 103330968, 'epoch': '0.2519'}
  8%|██████▉                                                                            | 147/1751 [2:30:23<25:56:33, 58.23s/it]  8%|███████                                                                            | 148/1751 [2:31:21<25:57:37, 58.30s/it]                                                                                                                                {'loss': '0.5693', 'grad_norm': '0.2393', 'learning_rate': '1.994e-05', 'ppl': '1.767', 'memory/max_active (GiB)': '74.23', 'memory/max_allocated (GiB)': '74.23', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '29.67', 'tokens/total': 281419616, 'tokens/trainable': 103998328, 'epoch': '0.2536'}
  8%|███████                                                                            | 148/1751 [2:31:21<25:57:37, 58.30s/it]  9%|███████                                                                            | 149/1751 [2:32:22<26:21:13, 59.22s/it]                                                                                                                                {'loss': '0.5482', 'grad_norm': '0.2139', 'learning_rate': '1.993e-05', 'ppl': '1.73', 'memory/max_active (GiB)': '77.4', 'memory/max_allocated (GiB)': '77.4', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '15.13', 'tokens/total': 283364256, 'tokens/trainable': 104715920, 'epoch': '0.2553'}
  9%|███████                                                                            | 149/1751 [2:32:22<26:21:13, 59.22s/it]  9%|███████                                                                            | 150/1751 [2:33:22<26:23:44, 59.35s/it]                                                                                                                                {'loss': '0.5606', 'grad_norm': '0.2256', 'learning_rate': '1.993e-05', 'ppl': '1.752', 'memory/max_active (GiB)': '77.2', 'memory/max_allocated (GiB)': '77.2', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '82.64', 'tokens/total': 285212352, 'tokens/trainable': 105387744, 'epoch': '0.257'}
  9%|███████                                                                            | 150/1751 [2:33:22<26:23:44, 59.35s/it]  9%|███████▏                                                                           | 151/1751 [2:34:24<26:41:48, 60.07s/it]                                                                                                                                {'loss': '0.562', 'grad_norm': '0.2266', 'learning_rate': '1.993e-05', 'ppl': '1.754', 'memory/max_active (GiB)': '71.22', 'memory/max_allocated (GiB)': '71.22', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '104', 'tokens/total': 287182656, 'tokens/trainable': 106121200, 'epoch': '0.2588'}
  9%|███████▏                                                                           | 151/1751 [2:34:24<26:41:48, 60.07s/it]  9%|███████▏                                                                           | 152/1751 [2:35:25<26:47:44, 60.33s/it]                                                                                                                                {'loss': '0.5522', 'grad_norm': '0.2295', 'learning_rate': '1.993e-05', 'ppl': '1.737', 'memory/max_active (GiB)': '73.12', 'memory/max_allocated (GiB)': '73.12', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '68.28', 'tokens/total': 289143744, 'tokens/trainable': 106845848, 'epoch': '0.2605'}
  9%|███████▏                                                                           | 152/1751 [2:35:25<26:47:44, 60.33s/it]  9%|███████▎                                                                           | 153/1751 [2:36:28<27:07:08, 61.09s/it]                                                                                                                                {'loss': '0.5338', 'grad_norm': '0.1982', 'learning_rate': '1.992e-05', 'ppl': '1.705', 'memory/max_active (GiB)': '76.04', 'memory/max_allocated (GiB)': '76.04', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '86.83', 'tokens/total': 291151168, 'tokens/trainable': 107589856, 'epoch': '0.2622'}
  9%|███████▎                                                                           | 153/1751 [2:36:28<27:07:08, 61.09s/it]  9%|███████▎                                                                           | 154/1751 [2:37:27<26:52:21, 60.58s/it]                                                                                                                                {'loss': '0.5421', 'grad_norm': '0.2266', 'learning_rate': '1.992e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '76.49', 'memory/max_allocated (GiB)': '76.49', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '121.9', 'tokens/total': 293002368, 'tokens/trainable': 108242792, 'epoch': '0.2639'}
  9%|███████▎                                                                           | 154/1751 [2:37:27<26:52:21, 60.58s/it]  9%|███████▎                                                                           | 155/1751 [2:38:25<26:32:10, 59.86s/it]                                                                                                                                {'loss': '0.6319', 'grad_norm': '0.2373', 'learning_rate': '1.992e-05', 'ppl': '1.881', 'memory/max_active (GiB)': '69.95', 'memory/max_allocated (GiB)': '69.95', 'memory/device_reserved (GiB)': '91.63', 'tokens/train_per_sec_per_gpu': '35.34', 'tokens/total': 294765984, 'tokens/trainable': 108898840, 'epoch': '0.2656'}
  9%|███████▎                                                                           | 155/1751 [2:38:25<26:32:10, 59.86s/it]  9%|███████▍                                                                           | 156/1751 [2:39:25<26:32:32, 59.91s/it]                                                                                                                                {'loss': '0.5384', 'grad_norm': '0.2051', 'learning_rate': '1.992e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '76.79', 'memory/max_allocated (GiB)': '76.79', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '100.5', 'tokens/total': 296650816, 'tokens/trainable': 109597784, 'epoch': '0.2673'}
  9%|███████▍                                                                           | 156/1751 [2:39:25<26:32:32, 59.91s/it]  9%|███████▍                                                                           | 157/1751 [2:40:24<26:23:40, 59.61s/it]                                                                                                                                {'loss': '0.5368', 'grad_norm': '0.21', 'learning_rate': '1.992e-05', 'ppl': '1.711', 'memory/max_active (GiB)': '77.69', 'memory/max_allocated (GiB)': '77.69', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '38.52', 'tokens/total': 298543040, 'tokens/trainable': 110309568, 'epoch': '0.269'}
  9%|███████▍                                                                           | 157/1751 [2:40:24<26:23:40, 59.61s/it]  9%|███████▍                                                                           | 158/1751 [2:41:23<26:14:49, 59.32s/it]                                                                                                                                {'loss': '0.5673', 'grad_norm': '0.2305', 'learning_rate': '1.991e-05', 'ppl': '1.764', 'memory/max_active (GiB)': '71.63', 'memory/max_allocated (GiB)': '71.63', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '116.5', 'tokens/total': 300366048, 'tokens/trainable': 110990664, 'epoch': '0.2707'}
  9%|███████▍                                                                           | 158/1751 [2:41:23<26:14:49, 59.32s/it]  9%|███████▌                                                                           | 159/1751 [2:42:24<26:29:15, 59.90s/it]                                                                                                                                {'loss': '0.5923', 'grad_norm': '0.21', 'learning_rate': '1.991e-05', 'ppl': '1.808', 'memory/max_active (GiB)': '77.1', 'memory/max_allocated (GiB)': '77.1', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '77.62', 'tokens/total': 302272032, 'tokens/trainable': 111726376, 'epoch': '0.2725'}
  9%|███████▌                                                                           | 159/1751 [2:42:24<26:29:15, 59.90s/it]  9%|███████▌                                                                           | 160/1751 [2:43:26<26:41:15, 60.39s/it]                                                                                                                                {'loss': '0.5078', 'grad_norm': '0.2109', 'learning_rate': '1.991e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '73.2', 'memory/max_allocated (GiB)': '73.2', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '133.8', 'tokens/total': 304282912, 'tokens/trainable': 112430512, 'epoch': '0.2742'}
  9%|███████▌                                                                           | 160/1751 [2:43:26<26:41:15, 60.39s/it]  9%|███████▋                                                                           | 161/1751 [2:44:26<26:44:01, 60.53s/it]                                                                                                                                {'loss': '0.5641', 'grad_norm': '0.2207', 'learning_rate': '1.991e-05', 'ppl': '1.758', 'memory/max_active (GiB)': '72.89', 'memory/max_allocated (GiB)': '72.89', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '58.04', 'tokens/total': 306212032, 'tokens/trainable': 113122456, 'epoch': '0.2759'}
  9%|███████▋                                                                           | 161/1751 [2:44:26<26:44:01, 60.53s/it]  9%|███████▋                                                                           | 162/1751 [2:45:25<26:29:09, 60.01s/it]                                                                                                                                {'loss': '0.5541', 'grad_norm': '0.2295', 'learning_rate': '1.99e-05', 'ppl': '1.74', 'memory/max_active (GiB)': '69.64', 'memory/max_allocated (GiB)': '69.64', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '106.1', 'tokens/total': 308051840, 'tokens/trainable': 113770344, 'epoch': '0.2776'}
  9%|███████▋                                                                           | 162/1751 [2:45:25<26:29:09, 60.01s/it]  9%|███████▋                                                                           | 163/1751 [2:46:27<26:43:04, 60.57s/it]                                                                                                                                {'loss': '0.5464', 'grad_norm': '0.2021', 'learning_rate': '1.99e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '72.93', 'memory/max_allocated (GiB)': '72.93', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '85.74', 'tokens/total': 309997056, 'tokens/trainable': 114501136, 'epoch': '0.2793'}
  9%|███████▋                                                                           | 163/1751 [2:46:27<26:43:04, 60.57s/it]  9%|███████▊                                                                           | 164/1751 [2:47:26<26:28:09, 60.04s/it]                                                                                                                                {'loss': '0.576', 'grad_norm': '0.2275', 'learning_rate': '1.99e-05', 'ppl': '1.779', 'memory/max_active (GiB)': '72.87', 'memory/max_allocated (GiB)': '72.87', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '145.2', 'tokens/total': 311846624, 'tokens/trainable': 115164048, 'epoch': '0.281'}
  9%|███████▊                                                                           | 164/1751 [2:47:26<26:28:09, 60.04s/it]  9%|███████▊                                                                           | 165/1751 [2:48:26<26:30:19, 60.16s/it]                                                                                                                                {'loss': '0.5569', 'grad_norm': '0.208', 'learning_rate': '1.989e-05', 'ppl': '1.745', 'memory/max_active (GiB)': '72.4', 'memory/max_allocated (GiB)': '72.4', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '97.88', 'tokens/total': 313752000, 'tokens/trainable': 115883800, 'epoch': '0.2827'}
  9%|███████▊                                                                           | 165/1751 [2:48:26<26:30:19, 60.16s/it]  9%|███████▊                                                                           | 166/1751 [2:49:25<26:14:59, 59.62s/it]                                                                                                                                {'loss': '0.6147', 'grad_norm': '0.2422', 'learning_rate': '1.989e-05', 'ppl': '1.849', 'memory/max_active (GiB)': '71.95', 'memory/max_allocated (GiB)': '71.95', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '116.5', 'tokens/total': 315547840, 'tokens/trainable': 116603136, 'epoch': '0.2845'}
  9%|███████▊                                                                           | 166/1751 [2:49:25<26:14:59, 59.62s/it] 10%|███████▉                                                                           | 167/1751 [2:50:27<26:36:01, 60.46s/it]                                                                                                                                {'loss': '0.5291', 'grad_norm': '0.2051', 'learning_rate': '1.989e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '70.5', 'memory/max_allocated (GiB)': '70.5', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '135.1', 'tokens/total': 317538176, 'tokens/trainable': 117351432, 'epoch': '0.2862'}
 10%|███████▉                                                                           | 167/1751 [2:50:27<26:36:01, 60.46s/it] 10%|███████▉                                                                           | 168/1751 [2:51:27<26:34:10, 60.42s/it]                                                                                                                                {'loss': '0.5656', 'grad_norm': '0.2383', 'learning_rate': '1.989e-05', 'ppl': '1.76', 'memory/max_active (GiB)': '73.88', 'memory/max_allocated (GiB)': '73.88', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '150.1', 'tokens/total': 319445824, 'tokens/trainable': 118068224, 'epoch': '0.2879'}
 10%|███████▉                                                                           | 168/1751 [2:51:27<26:34:10, 60.42s/it] 10%|███████████████▏                                                                                                                                             | 169/1751 [2:52:28<26:34:36, 60.48s/it]                                                                                                                                                                                                          {'loss': '0.5435', 'grad_norm': '0.2256', 'learning_rate': '1.988e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '77.89', 'memory/max_allocated (GiB)': '77.89', 'memory/device_reserved (GiB)': '93.05', 'tokens/train_per_sec_per_gpu': '49.2', 'tokens/total': 321357632, 'tokens/trainable': 118767920, 'epoch': '0.2896'}
 10%|███████████████▏                                                                                                                                             | 169/1751 [2:52:28<26:34:36, 60.48s/it] 10%|███████████████▏                                                                                                                                             | 170/1751 [2:53:28<26:25:48, 60.18s/it]                                                                                                                                                                                                          {'loss': '0.562', 'grad_norm': '0.2158', 'learning_rate': '1.988e-05', 'ppl': '1.754', 'memory/max_active (GiB)': '70.58', 'memory/max_allocated (GiB)': '70.58', 'memory/device_reserved (GiB)': '93.06', 'tokens/train_per_sec_per_gpu': '105.5', 'tokens/total': 323251648, 'tokens/trainable': 119475984, 'epoch': '0.2913'}
 10%|███████████████▏                                                                                                                                             | 170/1751 [2:53:28<26:25:48, 60.18s/it] 10%|███████████████▎                                                                                                                                             | 171/1751 [2:54:28<26:26:55, 60.26s/it]                                                                                                                                                                                                          {'loss': '0.5412', 'grad_norm': '0.2344', 'learning_rate': '1.988e-05', 'ppl': '1.718', 'memory/max_active (GiB)': '73.44', 'memory/max_allocated (GiB)': '73.44', 'memory/device_reserved (GiB)': '93.06', 'tokens/train_per_sec_per_gpu': '81.73', 'tokens/total': 325158848, 'tokens/trainable': 120169064, 'epoch': '0.293'}
 10%|███████████████▎                                                                                                                                             | 171/1751 [2:54:28<26:26:55, 60.26s/it] 10%|███████████████▍                                                                                                                                             | 172/1751 [2:55:28<26:27:20, 60.32s/it]                                                                                                                                                                                                          {'loss': '0.5414', 'grad_norm': '0.207', 'learning_rate': '1.987e-05', 'ppl': '1.718', 'memory/max_active (GiB)': '72.47', 'memory/max_allocated (GiB)': '72.47', 'memory/device_reserved (GiB)': '93.06', 'tokens/train_per_sec_per_gpu': '147.4', 'tokens/total': 327075776, 'tokens/trainable': 120882408, 'epoch': '0.2947'}
 10%|███████████████▍                                                                                                                                             | 172/1751 [2:55:28<26:27:20, 60.32s/it] 10%|███████████████▌                                                                                                                                             | 173/1751 [2:56:29<26:27:33, 60.36s/it]                                                                                                                                                                                                          {'loss': '0.5667', 'grad_norm': '0.2393', 'learning_rate': '1.987e-05', 'ppl': '1.762', 'memory/max_active (GiB)': '71.75', 'memory/max_allocated (GiB)': '71.75', 'memory/device_reserved (GiB)': '93.06', 'tokens/train_per_sec_per_gpu': '55.26', 'tokens/total': 329004032, 'tokens/trainable': 121604704, 'epoch': '0.2965'}
 10%|███████████████▌                                                                                                                                             | 173/1751 [2:56:29<26:27:33, 60.36s/it] 10%|███████████████▌                                                                                                                                             | 174/1751 [2:57:29<26:21:02, 60.15s/it]                                                                                                                                                                                                          {'loss': '0.5648', 'grad_norm': '0.2178', 'learning_rate': '1.987e-05', 'ppl': '1.759', 'memory/max_active (GiB)': '75.14', 'memory/max_allocated (GiB)': '75.14', 'memory/device_reserved (GiB)': '93.06', 'tokens/train_per_sec_per_gpu': '116.4', 'tokens/total': 330899008, 'tokens/trainable': 122339184, 'epoch': '0.2982'}
 10%|███████████████▌                                                                                                                                             | 174/1751 [2:57:29<26:21:02, 60.15s/it] 10%|███████████████▋                                                                                                                                             | 175/1751 [2:58:26<26:00:40, 59.42s/it]                                                                                                                                                                                                          {'loss': '0.5613', 'grad_norm': '0.2285', 'learning_rate': '1.987e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '74.36', 'memory/max_allocated (GiB)': '74.36', 'memory/device_reserved (GiB)': '93.06', 'tokens/train_per_sec_per_gpu': '75.31', 'tokens/total': 332705856, 'tokens/trainable': 122996056, 'epoch': '0.2999'}
 10%|███████████████▋                                                                                                                                             | 175/1751 [2:58:26<26:00:40, 59.42s/it] 10%|███████████████▊                                                                                                                                             | 176/1751 [2:59:25<25:56:38, 59.30s/it]                                                                                                                                                                                                          {'loss': '0.5604', 'grad_norm': '0.2031', 'learning_rate': '1.986e-05', 'ppl': '1.751', 'memory/max_active (GiB)': '73.29', 'memory/max_allocated (GiB)': '73.29', 'memory/device_reserved (GiB)': '93.06', 'tokens/train_per_sec_per_gpu': '182.3', 'tokens/total': 334560096, 'tokens/trainable': 123692696, 'epoch': '0.3016'}
 10%|███████████████▊                                                                                                                                             | 176/1751 [2:59:25<25:56:38, 59.30s/it] 10%|███████████████▊                                                                                                                                             | 177/1751 [3:00:25<26:02:13, 59.55s/it]                                                                                                                                                                                                          {'loss': '0.5891', 'grad_norm': '0.2285', 'learning_rate': '1.986e-05', 'ppl': '1.802', 'memory/max_active (GiB)': '76.36', 'memory/max_allocated (GiB)': '76.36', 'memory/device_reserved (GiB)': '93.06', 'tokens/train_per_sec_per_gpu': '70.19', 'tokens/total': 336511840, 'tokens/trainable': 124427632, 'epoch': '0.3033'}
 10%|███████████████▊                                                                                                                                             | 177/1751 [3:00:25<26:02:13, 59.55s/it] 10%|███████████████▉                                                                                                                                             | 178/1751 [3:01:24<25:54:33, 59.30s/it]                                                                                                                                                                                                          {'loss': '0.5571', 'grad_norm': '0.2236', 'learning_rate': '1.986e-05', 'ppl': '1.746', 'memory/max_active (GiB)': '72.14', 'memory/max_allocated (GiB)': '72.14', 'memory/device_reserved (GiB)': '93.06', 'tokens/train_per_sec_per_gpu': '81.78', 'tokens/total': 338289824, 'tokens/trainable': 125098256, 'epoch': '0.305'}
 10%|███████████████▉                                                                                                                                             | 178/1751 [3:01:24<25:54:33, 59.30s/it] 10%|████████████████                                                                                                                                             | 179/1751 [3:02:23<25:49:42, 59.15s/it]                                                                                                                                                                                                          {'loss': '0.5804', 'grad_norm': '0.2246', 'learning_rate': '1.985e-05', 'ppl': '1.787', 'memory/max_active (GiB)': '76.27', 'memory/max_allocated (GiB)': '76.27', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '56.6', 'tokens/total': 340114912, 'tokens/trainable': 125794640, 'epoch': '0.3067'}
 10%|████████████████                                                                                                                                             | 179/1751 [3:02:23<25:49:42, 59.15s/it] 10%|████████████████▏                                                                                                                                            | 180/1751 [3:03:22<25:46:54, 59.08s/it]                                                                                                                                                                                                          {'loss': '0.5645', 'grad_norm': '0.2324', 'learning_rate': '1.985e-05', 'ppl': '1.759', 'memory/max_active (GiB)': '75.33', 'memory/max_allocated (GiB)': '75.33', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '54.41', 'tokens/total': 341930048, 'tokens/trainable': 126498560, 'epoch': '0.3084'}
 10%|████████████████▏                                                                                                                                            | 180/1751 [3:03:22<25:46:54, 59.08s/it] 10%|████████████████▏                                                                                                                                            | 181/1751 [3:04:23<26:02:40, 59.72s/it]                                                                                                                                                                                                          {'loss': '0.5729', 'grad_norm': '0.2168', 'learning_rate': '1.985e-05', 'ppl': '1.773', 'memory/max_active (GiB)': '71.71', 'memory/max_allocated (GiB)': '71.71', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '108.2', 'tokens/total': 343855520, 'tokens/trainable': 127193584, 'epoch': '0.3102'}
 10%|████████████████▏                                                                                                                                            | 181/1751 [3:04:23<26:02:40, 59.72s/it] 10%|████████████████▎                                                                                                                                            | 182/1751 [3:05:22<25:53:38, 59.41s/it]                                                                                                                                                                                                          {'loss': '0.5404', 'grad_norm': '0.2158', 'learning_rate': '1.984e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '76.77', 'memory/max_allocated (GiB)': '76.77', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '55.96', 'tokens/total': 345671936, 'tokens/trainable': 127907472, 'epoch': '0.3119'}
 10%|████████████████▎                                                                                                                                            | 182/1751 [3:05:22<25:53:38, 59.41s/it] 10%|████████████████▍                                                                                                                                            | 183/1751 [3:06:21<25:47:06, 59.20s/it]                                                                                                                                                                                                          {'loss': '0.5671', 'grad_norm': '0.2344', 'learning_rate': '1.984e-05', 'ppl': '1.763', 'memory/max_active (GiB)': '72.49', 'memory/max_allocated (GiB)': '72.49', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '55.35', 'tokens/total': 347493376, 'tokens/trainable': 128575256, 'epoch': '0.3136'}
 10%|████████████████▍                                                                                                                                            | 183/1751 [3:06:21<25:47:06, 59.20s/it] 11%|████████████████▍                                                                                                                                            | 184/1751 [3:07:19<25:39:15, 58.94s/it]                                                                                                                                                                                                          {'loss': '0.575', 'grad_norm': '0.2451', 'learning_rate': '1.984e-05', 'ppl': '1.777', 'memory/max_active (GiB)': '72.71', 'memory/max_allocated (GiB)': '72.71', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '54.06', 'tokens/total': 349342944, 'tokens/trainable': 129232760, 'epoch': '0.3153'}
 11%|████████████████▍                                                                                                                                            | 184/1751 [3:07:19<25:39:15, 58.94s/it] 11%|████████████████▌                                                                                                                                            | 185/1751 [3:08:18<25:42:18, 59.09s/it]                                                                                                                                                                                                          {'loss': '0.5638', 'grad_norm': '0.2041', 'learning_rate': '1.983e-05', 'ppl': '1.757', 'memory/max_active (GiB)': '67.82', 'memory/max_allocated (GiB)': '67.82', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '45.97', 'tokens/total': 351237568, 'tokens/trainable': 129949480, 'epoch': '0.317'}
 11%|████████████████▌                                                                                                                                            | 185/1751 [3:08:18<25:42:18, 59.09s/it] 11%|████████████████▋                                                                                                                                            | 186/1751 [3:09:20<25:58:30, 59.75s/it]                                                                                                                                                                                                          {'loss': '0.5655', 'grad_norm': '0.2891', 'learning_rate': '1.983e-05', 'ppl': '1.76', 'memory/max_active (GiB)': '73.82', 'memory/max_allocated (GiB)': '73.82', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '76.71', 'tokens/total': 353182912, 'tokens/trainable': 130673640, 'epoch': '0.3187'}
 11%|████████████████▋                                                                                                                                            | 186/1751 [3:09:20<25:58:30, 59.75s/it] 11%|████████████████▊                                                                                                                                            | 187/1751 [3:10:20<26:04:53, 60.03s/it]                                                                                                                                                                                                          {'loss': '0.5683', 'grad_norm': '0.2109', 'learning_rate': '1.983e-05', 'ppl': '1.765', 'memory/max_active (GiB)': '69.42', 'memory/max_allocated (GiB)': '69.42', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '54.71', 'tokens/total': 355090624, 'tokens/trainable': 131381384, 'epoch': '0.3204'}
 11%|████████████████▊                                                                                                                                            | 187/1751 [3:10:20<26:04:53, 60.03s/it] 11%|████████████████▊                                                                                                                                            | 188/1751 [3:11:20<25:57:52, 59.80s/it]                                                                                                                                                                                                          {'loss': '0.5716', 'grad_norm': '0.2373', 'learning_rate': '1.982e-05', 'ppl': '1.771', 'memory/max_active (GiB)': '75.86', 'memory/max_allocated (GiB)': '75.86', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '145.8', 'tokens/total': 356952320, 'tokens/trainable': 132059936, 'epoch': '0.3222'}
 11%|████████████████▊                                                                                                                                            | 188/1751 [3:11:20<25:57:52, 59.80s/it] 11%|████████████████▉                                                                                                                                            | 189/1751 [3:12:19<25:57:05, 59.81s/it]                                                                                                                                                                                                          {'loss': '0.5728', 'grad_norm': '0.252', 'learning_rate': '1.982e-05', 'ppl': '1.773', 'memory/max_active (GiB)': '75.46', 'memory/max_allocated (GiB)': '75.46', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '61.13', 'tokens/total': 358792640, 'tokens/trainable': 132770232, 'epoch': '0.3239'}
 11%|████████████████▉                                                                                                                                            | 189/1751 [3:12:19<25:57:05, 59.81s/it] 11%|█████████████████                                                                                                                                            | 190/1751 [3:13:20<26:00:44, 59.99s/it]                                                                                                                                                                                                          {'loss': '0.579', 'grad_norm': '0.2197', 'learning_rate': '1.982e-05', 'ppl': '1.784', 'memory/max_active (GiB)': '75.49', 'memory/max_allocated (GiB)': '75.49', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '184.5', 'tokens/total': 360696064, 'tokens/trainable': 133508496, 'epoch': '0.3256'}
 11%|█████████████████                                                                                                                                            | 190/1751 [3:13:20<26:00:44, 59.99s/it] 11%|█████████████████▏                                                                                                                                           | 191/1751 [3:14:20<26:04:26, 60.17s/it]                                                                                                                                                                                                          {'loss': '0.5278', 'grad_norm': '0.2373', 'learning_rate': '1.981e-05', 'ppl': '1.695', 'memory/max_active (GiB)': '69.89', 'memory/max_allocated (GiB)': '69.89', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '50.75', 'tokens/total': 362607936, 'tokens/trainable': 134190408, 'epoch': '0.3273'}
 11%|█████████████████▏                                                                                                                                           | 191/1751 [3:14:20<26:04:26, 60.17s/it] 11%|█████████████████▏                                                                                                                                           | 192/1751 [3:15:19<25:53:20, 59.78s/it]                                                                                                                                                                                                          {'loss': '0.5966', 'grad_norm': '0.2422', 'learning_rate': '1.981e-05', 'ppl': '1.816', 'memory/max_active (GiB)': '77.15', 'memory/max_allocated (GiB)': '77.15', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '117.2', 'tokens/total': 364409408, 'tokens/trainable': 134839424, 'epoch': '0.329'}
 11%|█████████████████▏                                                                                                                                           | 192/1751 [3:15:19<25:53:20, 59.78s/it] 11%|█████████████████▎                                                                                                                                           | 193/1751 [3:16:20<26:01:18, 60.13s/it]                                                                                                                                                                                                          {'loss': '0.5541', 'grad_norm': '0.2285', 'learning_rate': '1.98e-05', 'ppl': '1.74', 'memory/max_active (GiB)': '74.29', 'memory/max_allocated (GiB)': '74.29', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '203.3', 'tokens/total': 366360960, 'tokens/trainable': 135574288, 'epoch': '0.3307'}
 11%|█████████████████▎                                                                                                                                           | 193/1751 [3:16:20<26:01:18, 60.13s/it] 11%|█████████████████▍                                                                                                                                           | 194/1751 [3:17:21<26:07:07, 60.39s/it]                                                                                                                                                                                                          {'loss': '0.5122', 'grad_norm': '0.2305', 'learning_rate': '1.98e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '78.06', 'memory/max_allocated (GiB)': '78.06', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '99.4', 'tokens/total': 368302432, 'tokens/trainable': 136291680, 'epoch': '0.3324'}
 11%|█████████████████▍                                                                                                                                           | 194/1751 [3:17:21<26:07:07, 60.39s/it] 11%|█████████████████▍                                                                                                                                           | 195/1751 [3:18:19<25:43:21, 59.51s/it]                                                                                                                                                                                                          {'loss': '0.5648', 'grad_norm': '0.2295', 'learning_rate': '1.98e-05', 'ppl': '1.759', 'memory/max_active (GiB)': '75.5', 'memory/max_allocated (GiB)': '75.5', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '187.9', 'tokens/total': 370070848, 'tokens/trainable': 136955744, 'epoch': '0.3341'}
 11%|█████████████████▍                                                                                                                                           | 195/1751 [3:18:19<25:43:21, 59.51s/it] 11%|█████████████████▌                                                                                                                                           | 196/1751 [3:19:17<25:35:30, 59.25s/it]                                                                                                                                                                                                          {'loss': '0.5971', 'grad_norm': '0.2168', 'learning_rate': '1.979e-05', 'ppl': '1.817', 'memory/max_active (GiB)': '72.76', 'memory/max_allocated (GiB)': '72.76', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '75.45', 'tokens/total': 371963104, 'tokens/trainable': 137648128, 'epoch': '0.3359'}
 11%|█████████████████▌                                                                                                                                           | 196/1751 [3:19:17<25:35:30, 59.25s/it] 11%|█████████████████▋                                                                                                                                           | 197/1751 [3:20:18<25:47:28, 59.75s/it]                                                                                                                                                                                                          {'loss': '0.5891', 'grad_norm': '0.2314', 'learning_rate': '1.979e-05', 'ppl': '1.802', 'memory/max_active (GiB)': '76.52', 'memory/max_allocated (GiB)': '76.52', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '45.54', 'tokens/total': 373906304, 'tokens/trainable': 138350752, 'epoch': '0.3376'}
 11%|█████████████████▋                                                                                                                                           | 197/1751 [3:20:18<25:47:28, 59.75s/it] 11%|█████████████████▊                                                                                                                                           | 198/1751 [3:21:17<25:35:56, 59.34s/it]                                                                                                                                                                                                          {'loss': '0.6074', 'grad_norm': '0.2275', 'learning_rate': '1.979e-05', 'ppl': '1.836', 'memory/max_active (GiB)': '73.69', 'memory/max_allocated (GiB)': '73.69', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '70.16', 'tokens/total': 375711872, 'tokens/trainable': 139029904, 'epoch': '0.3393'}
 11%|█████████████████▊                                                                                                                                           | 198/1751 [3:21:17<25:35:56, 59.34s/it] 11%|█████████████████▊                                                                                                                                           | 199/1751 [3:22:17<25:41:45, 59.60s/it]                                                                                                                                                                                                          {'loss': '0.5427', 'grad_norm': '0.21', 'learning_rate': '1.978e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '72.47', 'memory/max_allocated (GiB)': '72.47', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '40.9', 'tokens/total': 377597216, 'tokens/trainable': 139713712, 'epoch': '0.341'}
 11%|█████████████████▊                                                                                                                                           | 199/1751 [3:22:17<25:41:45, 59.60s/it] 11%|█████████████████▉                                                                                                                                           | 200/1751 [3:23:17<25:43:29, 59.71s/it]                                                                                                                                                                                                          {'loss': '0.5531', 'grad_norm': '0.2148', 'learning_rate': '1.978e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '74.77', 'memory/max_allocated (GiB)': '74.77', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '44.11', 'tokens/total': 379460320, 'tokens/trainable': 140418432, 'epoch': '0.3427'}
 11%|█████████████████▉                                                                                                                                           | 200/1751 [3:23:17<25:43:29, 59.71s/it] 11%|██████████████████                                                                                                                                           | 201/1751 [3:24:15<25:33:22, 59.36s/it]                                                                                                                                                                                                          {'loss': '0.579', 'grad_norm': '0.2148', 'learning_rate': '1.977e-05', 'ppl': '1.784', 'memory/max_active (GiB)': '72.95', 'memory/max_allocated (GiB)': '72.95', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '233.6', 'tokens/total': 381300672, 'tokens/trainable': 141086064, 'epoch': '0.3444'}
 11%|██████████████████                                                                                                                                           | 201/1751 [3:24:15<25:33:22, 59.36s/it] 12%|██████████████████                                                                                                                                           | 202/1751 [3:25:12<25:15:36, 58.71s/it]                                                                                                                                                                                                          {'loss': '0.5524', 'grad_norm': '0.2207', 'learning_rate': '1.977e-05', 'ppl': '1.737', 'memory/max_active (GiB)': '68.42', 'memory/max_allocated (GiB)': '68.42', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '64.02', 'tokens/total': 383047552, 'tokens/trainable': 141748544, 'epoch': '0.3461'}
 12%|██████████████████                                                                                                                                           | 202/1751 [3:25:12<25:15:36, 58.71s/it] 12%|██████████████████▏                                                                                                                                          | 203/1751 [3:26:12<25:17:14, 58.81s/it]                                                                                                                                                                                                          {'loss': '0.5663', 'grad_norm': '0.2246', 'learning_rate': '1.977e-05', 'ppl': '1.762', 'memory/max_active (GiB)': '72.63', 'memory/max_allocated (GiB)': '72.63', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '26.51', 'tokens/total': 384909728, 'tokens/trainable': 142454608, 'epoch': '0.3479'}
 12%|██████████████████▏                                                                                                                                          | 203/1751 [3:26:12<25:17:14, 58.81s/it] 12%|██████████████████▎                                                                                                                                          | 204/1751 [3:27:11<25:23:35, 59.09s/it]                                                                                                                                                                                                          {'loss': '0.5403', 'grad_norm': '0.2207', 'learning_rate': '1.976e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '71', 'memory/max_allocated (GiB)': '71', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '31.37', 'tokens/total': 386832160, 'tokens/trainable': 143175536, 'epoch': '0.3496'}
 12%|██████████████████▎                                                                                                                                          | 204/1751 [3:27:11<25:23:35, 59.09s/it] 12%|██████████▎                                                                             | 205/1751 [3:28:12<25:37:30, 59.67s/it]                                                                                                                                                                                                          {'loss': '0.5473', 'grad_norm': '0.2188', 'learning_rate': '1.976e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '73.77', 'memory/max_allocated (GiB)': '73.77', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '142.6', 'tokens/total': 388766816, 'tokens/trainable': 143899216, 'epoch': '0.3513'}
 12%|██████████▎                                                                             | 205/1751 [3:28:12<25:37:30, 59.67s/it] 12%|██████████▎                                                                             | 206/1751 [3:29:13<25:44:10, 59.97s/it]                                                                                                                                     {'loss': '0.5713', 'grad_norm': '0.2188', 'learning_rate': '1.975e-05', 'ppl': '1.771', 'memory/max_active (GiB)': '74.77', 'memory/max_allocated (GiB)': '74.77', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '119.8', 'tokens/total': 390739936, 'tokens/trainable': 144631120, 'epoch': '0.353'}
 12%|██████████▎                                                                             | 206/1751 [3:29:13<25:44:10, 59.97s/it] 12%|██████████▍                                                                             | 207/1751 [3:30:11<25:29:14, 59.43s/it]                                                                                                                                     {'loss': '0.5098', 'grad_norm': '0.2305', 'learning_rate': '1.975e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '75.14', 'memory/max_allocated (GiB)': '75.14', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '53.33', 'tokens/total': 392610592, 'tokens/trainable': 145312016, 'epoch': '0.3547'}
 12%|██████████▍                                                                             | 207/1751 [3:30:11<25:29:14, 59.43s/it] 12%|██████████▍                                                                             | 208/1751 [3:31:10<25:27:33, 59.40s/it]                                                                                                                                     {'loss': '0.5318', 'grad_norm': '0.1982', 'learning_rate': '1.974e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '76.06', 'memory/max_allocated (GiB)': '76.06', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '60.63', 'tokens/total': 394476064, 'tokens/trainable': 146007712, 'epoch': '0.3564'}
 12%|██████████▍                                                                             | 208/1751 [3:31:10<25:27:33, 59.40s/it] 12%|██████████▌                                                                             | 209/1751 [3:32:06<24:59:05, 58.33s/it]                                                                                                                                     {'loss': '0.5645', 'grad_norm': '0.2344', 'learning_rate': '1.974e-05', 'ppl': '1.759', 'memory/max_active (GiB)': '76.19', 'memory/max_allocated (GiB)': '76.19', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '109.9', 'tokens/total': 396221344, 'tokens/trainable': 146631648, 'epoch': '0.3581'}
 12%|██████████▌                                                                             | 209/1751 [3:32:06<24:59:05, 58.33s/it] 12%|██████████▌                                                                             | 210/1751 [3:33:06<25:06:45, 58.67s/it]                                                                                                                                     {'loss': '0.6025', 'grad_norm': '0.2441', 'learning_rate': '1.974e-05', 'ppl': '1.827', 'memory/max_active (GiB)': '67.98', 'memory/max_allocated (GiB)': '67.98', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '131.2', 'tokens/total': 397990592, 'tokens/trainable': 147282800, 'epoch': '0.3599'}
 12%|██████████▌                                                                             | 210/1751 [3:33:06<25:06:45, 58.67s/it] 12%|██████████▌                                                                             | 211/1751 [3:34:08<25:36:10, 59.85s/it]                                                                                                                                     {'loss': '0.5373', 'grad_norm': '0.2197', 'learning_rate': '1.973e-05', 'ppl': '1.711', 'memory/max_active (GiB)': '76.26', 'memory/max_allocated (GiB)': '76.26', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '145.5', 'tokens/total': 399995072, 'tokens/trainable': 148017184, 'epoch': '0.3616'}
 12%|██████████▌                                                                             | 211/1751 [3:34:08<25:36:10, 59.85s/it] 12%|██████████▋                                                                             | 212/1751 [3:35:05<25:13:17, 59.00s/it]                                                                                                                                     {'loss': '0.5707', 'grad_norm': '0.2451', 'learning_rate': '1.973e-05', 'ppl': '1.769', 'memory/max_active (GiB)': '64.42', 'memory/max_allocated (GiB)': '64.42', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '59.72', 'tokens/total': 401770080, 'tokens/trainable': 148690480, 'epoch': '0.3633'}
 12%|██████████▋                                                                             | 212/1751 [3:35:05<25:13:17, 59.00s/it] 12%|██████████▋                                                                             | 213/1751 [3:36:09<25:47:48, 60.38s/it]                                                                                                                                     {'loss': '0.5485', 'grad_norm': '0.208', 'learning_rate': '1.972e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '72.52', 'memory/max_allocated (GiB)': '72.52', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '87.91', 'tokens/total': 403802368, 'tokens/trainable': 149469472, 'epoch': '0.365'}
 12%|██████████▋                                                                             | 213/1751 [3:36:09<25:47:48, 60.38s/it] 12%|██████████▊                                                                             | 214/1751 [3:37:11<25:57:35, 60.80s/it]                                                                                                                                     {'loss': '0.5242', 'grad_norm': '0.2031', 'learning_rate': '1.972e-05', 'ppl': '1.689', 'memory/max_active (GiB)': '75.51', 'memory/max_allocated (GiB)': '75.51', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '125.2', 'tokens/total': 405810880, 'tokens/trainable': 150229248, 'epoch': '0.3667'}
 12%|██████████▊                                                                             | 214/1751 [3:37:11<25:57:35, 60.80s/it] 12%|██████████▊                                                                             | 215/1751 [3:38:14<26:11:33, 61.39s/it]                                                                                                                                     {'loss': '0.5392', 'grad_norm': '0.209', 'learning_rate': '1.971e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '76.48', 'memory/max_allocated (GiB)': '76.48', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '66.22', 'tokens/total': 407836576, 'tokens/trainable': 151006304, 'epoch': '0.3684'}
 12%|██████████▊                                                                             | 215/1751 [3:38:14<26:11:33, 61.39s/it] 12%|██████████▊                                                                             | 216/1751 [3:39:13<25:54:12, 60.75s/it]                                                                                                                                     {'loss': '0.5636', 'grad_norm': '0.2168', 'learning_rate': '1.971e-05', 'ppl': '1.757', 'memory/max_active (GiB)': '71.32', 'memory/max_allocated (GiB)': '71.32', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '43.57', 'tokens/total': 409734976, 'tokens/trainable': 151709216, 'epoch': '0.3701'}
 12%|██████████▊                                                                             | 216/1751 [3:39:13<25:54:12, 60.75s/it] 12%|██████████▉                                                                             | 217/1751 [3:40:12<25:41:13, 60.28s/it]                                                                                                                                     {'loss': '0.5556', 'grad_norm': '0.2119', 'learning_rate': '1.97e-05', 'ppl': '1.743', 'memory/max_active (GiB)': '68.81', 'memory/max_allocated (GiB)': '68.81', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '60.25', 'tokens/total': 411614432, 'tokens/trainable': 152390352, 'epoch': '0.3718'}
 12%|██████████▉                                                                             | 217/1751 [3:40:12<25:41:13, 60.28s/it] 12%|██████████▉                                                                             | 218/1751 [3:41:13<25:45:31, 60.49s/it]                                                                                                                                     {'loss': '0.5437', 'grad_norm': '0.1973', 'learning_rate': '1.97e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '70.09', 'memory/max_allocated (GiB)': '70.09', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '102.2', 'tokens/total': 413559424, 'tokens/trainable': 153105728, 'epoch': '0.3736'}
 12%|██████████▉                                                                             | 218/1751 [3:41:13<25:45:31, 60.49s/it] 13%|███████████                                                                             | 219/1751 [3:42:14<25:50:45, 60.73s/it]                                                                                                                                     {'loss': '0.5814', 'grad_norm': '0.2227', 'learning_rate': '1.97e-05', 'ppl': '1.789', 'memory/max_active (GiB)': '76.9', 'memory/max_allocated (GiB)': '76.9', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '73.17', 'tokens/total': 415495712, 'tokens/trainable': 153832256, 'epoch': '0.3753'}
 13%|███████████                                                                             | 219/1751 [3:42:14<25:50:45, 60.73s/it] 13%|███████████                                                                             | 220/1751 [3:43:14<25:45:11, 60.56s/it]                                                                                                                                     {'loss': '0.5258', 'grad_norm': '0.2158', 'learning_rate': '1.969e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '69.12', 'memory/max_allocated (GiB)': '69.12', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '79.42', 'tokens/total': 417353024, 'tokens/trainable': 154501152, 'epoch': '0.377'}
 13%|███████████                                                                             | 220/1751 [3:43:14<25:45:11, 60.56s/it] 13%|███████████                                                                             | 221/1751 [3:44:13<25:32:21, 60.09s/it]                                                                                                                                     {'loss': '0.5539', 'grad_norm': '0.2148', 'learning_rate': '1.969e-05', 'ppl': '1.74', 'memory/max_active (GiB)': '72.26', 'memory/max_allocated (GiB)': '72.26', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '41.2', 'tokens/total': 419235712, 'tokens/trainable': 155193104, 'epoch': '0.3787'}
 13%|███████████                                                                             | 221/1751 [3:44:13<25:32:21, 60.09s/it] 13%|███████████▏                                                                            | 222/1751 [3:45:09<24:56:45, 58.73s/it]                                                                                                                                     {'loss': '0.57', 'grad_norm': '0.2188', 'learning_rate': '1.968e-05', 'ppl': '1.768', 'memory/max_active (GiB)': '70.4', 'memory/max_allocated (GiB)': '70.4', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '134.2', 'tokens/total': 420913472, 'tokens/trainable': 155834608, 'epoch': '0.3804'}
 13%|███████████▏                                                                            | 222/1751 [3:45:09<24:56:45, 58.73s/it] 13%|███████████▏                                                                            | 223/1751 [3:46:09<25:02:52, 59.01s/it]                                                                                                                                     {'loss': '0.5548', 'grad_norm': '0.2373', 'learning_rate': '1.968e-05', 'ppl': '1.742', 'memory/max_active (GiB)': '75.8', 'memory/max_allocated (GiB)': '75.8', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '81.88', 'tokens/total': 422800896, 'tokens/trainable': 156505824, 'epoch': '0.3821'}
 13%|███████████▏                                                                            | 223/1751 [3:46:09<25:02:52, 59.01s/it] 13%|███████████▎                                                                            | 224/1751 [3:47:09<25:12:13, 59.42s/it]                                                                                                                                     {'loss': '0.5142', 'grad_norm': '0.2207', 'learning_rate': '1.967e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '75.93', 'memory/max_allocated (GiB)': '75.93', 'memory/device_reserved (GiB)': '93.93', 'tokens/train_per_sec_per_gpu': '44.64', 'tokens/total': 424735776, 'tokens/trainable': 157225728, 'epoch': '0.3838'}
 13%|███████████▎                                                                            | 224/1751 [3:47:09<25:12:13, 59.42s/it] 13%|███████████▎                                                                            | 225/1751 [3:48:09<25:18:58, 59.72s/it]                                                                                                                                     {'loss': '0.5572', 'grad_norm': '0.2217', 'learning_rate': '1.967e-05', 'ppl': '1.746', 'memory/max_active (GiB)': '76.18', 'memory/max_allocated (GiB)': '76.18', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '113.5', 'tokens/total': 426667776, 'tokens/trainable': 157949696, 'epoch': '0.3856'}
 13%|███████████▎                                                                            | 225/1751 [3:48:09<25:18:58, 59.72s/it] 13%|███████████▎                                                                            | 226/1751 [3:49:08<25:08:26, 59.35s/it]                                                                                                                                     {'loss': '0.5708', 'grad_norm': '0.2236', 'learning_rate': '1.966e-05', 'ppl': '1.77', 'memory/max_active (GiB)': '76.75', 'memory/max_allocated (GiB)': '76.75', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '49.88', 'tokens/total': 428496864, 'tokens/trainable': 158626864, 'epoch': '0.3873'}
 13%|███████████▎                                                                            | 226/1751 [3:49:08<25:08:26, 59.35s/it] 13%|███████████▍                                                                            | 227/1751 [3:50:07<25:08:15, 59.38s/it]                                                                                                                                     {'loss': '0.576', 'grad_norm': '0.2188', 'learning_rate': '1.966e-05', 'ppl': '1.779', 'memory/max_active (GiB)': '76.18', 'memory/max_allocated (GiB)': '76.18', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '81.84', 'tokens/total': 430366784, 'tokens/trainable': 159305008, 'epoch': '0.389'}
 13%|███████████▍                                                                            | 227/1751 [3:50:07<25:08:15, 59.38s/it] 13%|███████████▍                                                                            | 228/1751 [3:51:09<25:25:32, 60.10s/it]                                                                                                                                     {'loss': '0.5442', 'grad_norm': '0.209', 'learning_rate': '1.965e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '73.48', 'memory/max_allocated (GiB)': '73.48', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '109.6', 'tokens/total': 432288832, 'tokens/trainable': 160020128, 'epoch': '0.3907'}
 13%|███████████▍                                                                            | 228/1751 [3:51:09<25:25:32, 60.10s/it] 13%|███████████▌                                                                            | 229/1751 [3:52:10<25:29:27, 60.29s/it]                                                                                                                                     {'loss': '0.5543', 'grad_norm': '0.1992', 'learning_rate': '1.965e-05', 'ppl': '1.741', 'memory/max_active (GiB)': '72.62', 'memory/max_allocated (GiB)': '72.62', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '132.1', 'tokens/total': 434179456, 'tokens/trainable': 160724528, 'epoch': '0.3924'}
 13%|███████████▌                                                                            | 229/1751 [3:52:10<25:29:27, 60.29s/it] 13%|███████████▌                                                                            | 230/1751 [3:53:09<25:19:48, 59.95s/it]                                                                                                                                     {'loss': '0.5727', 'grad_norm': '0.2285', 'learning_rate': '1.964e-05', 'ppl': '1.773', 'memory/max_active (GiB)': '69.66', 'memory/max_allocated (GiB)': '69.66', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '84.76', 'tokens/total': 436091296, 'tokens/trainable': 161416240, 'epoch': '0.3941'}
 13%|███████████▌                                                                            | 230/1751 [3:53:09<25:19:48, 59.95s/it] 13%|███████████▌                                                                            | 231/1751 [3:54:10<25:23:17, 60.13s/it]                                                                                                                                     {'loss': '0.5289', 'grad_norm': '0.2012', 'learning_rate': '1.964e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '70.79', 'memory/max_allocated (GiB)': '70.79', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '53.69', 'tokens/total': 437995872, 'tokens/trainable': 162140656, 'epoch': '0.3958'}
 13%|███████████▌                                                                            | 231/1751 [3:54:10<25:23:17, 60.13s/it] 13%|███████████▋                                                                            | 232/1751 [3:55:07<25:04:25, 59.42s/it]                                                                                                                                     {'loss': '0.5309', 'grad_norm': '0.21', 'learning_rate': '1.963e-05', 'ppl': '1.7', 'memory/max_active (GiB)': '75.17', 'memory/max_allocated (GiB)': '75.17', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '69.12', 'tokens/total': 439836832, 'tokens/trainable': 162797584, 'epoch': '0.3976'}
 13%|███████████▋                                                                            | 232/1751 [3:55:07<25:04:25, 59.42s/it] 13%|███████████▋                                                                            | 233/1751 [3:56:04<24:45:59, 58.73s/it]                                                                                                                                     {'loss': '0.565', 'grad_norm': '0.2129', 'learning_rate': '1.963e-05', 'ppl': '1.759', 'memory/max_active (GiB)': '73.64', 'memory/max_allocated (GiB)': '73.64', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '83.23', 'tokens/total': 441659200, 'tokens/trainable': 163449488, 'epoch': '0.3993'}
 13%|███████████▋                                                                            | 233/1751 [3:56:04<24:45:59, 58.73s/it] 13%|███████████▊                                                                            | 234/1751 [3:57:06<25:08:51, 59.68s/it]                                                                                                                                     {'loss': '0.5198', 'grad_norm': '0.1973', 'learning_rate': '1.962e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '72.8', 'memory/max_allocated (GiB)': '72.8', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '110.2', 'tokens/total': 443669632, 'tokens/trainable': 164170400, 'epoch': '0.401'}
 13%|███████████▊                                                                            | 234/1751 [3:57:06<25:08:51, 59.68s/it] 13%|███████████▊                                                                            | 235/1751 [3:58:07<25:15:34, 59.98s/it]                                                                                                                                     {'loss': '0.5502', 'grad_norm': '0.2471', 'learning_rate': '1.962e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '74.56', 'memory/max_allocated (GiB)': '74.56', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '57.04', 'tokens/total': 445593632, 'tokens/trainable': 164872032, 'epoch': '0.4027'}
 13%|███████████▊                                                                            | 235/1751 [3:58:07<25:15:34, 59.98s/it] 13%|███████████▊                                                                            | 236/1751 [3:59:09<25:32:25, 60.69s/it]                                                                                                                                     {'loss': '0.5429', 'grad_norm': '0.2031', 'learning_rate': '1.961e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '72.68', 'memory/max_allocated (GiB)': '72.68', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '101', 'tokens/total': 447564480, 'tokens/trainable': 165592432, 'epoch': '0.4044'}
 13%|███████████▊                                                                            | 236/1751 [3:59:09<25:32:25, 60.69s/it] 14%|███████████▉                                                                            | 237/1751 [4:00:10<25:33:33, 60.78s/it]                                                                                                                                     {'loss': '0.51', 'grad_norm': '0.2012', 'learning_rate': '1.961e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '76.33', 'memory/max_allocated (GiB)': '76.33', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '95.11', 'tokens/total': 449536704, 'tokens/trainable': 166331728, 'epoch': '0.4061'}
 14%|███████████▉                                                                            | 237/1751 [4:00:10<25:33:33, 60.78s/it] 14%|███████████▉                                                                            | 238/1751 [4:01:12<25:35:32, 60.89s/it]                                                                                                                                     {'loss': '0.5502', 'grad_norm': '0.207', 'learning_rate': '1.96e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '74.52', 'memory/max_allocated (GiB)': '74.52', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '86.57', 'tokens/total': 451468224, 'tokens/trainable': 167025584, 'epoch': '0.4078'}
 14%|███████████▉                                                                            | 238/1751 [4:01:12<25:35:32, 60.89s/it] 14%|████████████                                                                            | 239/1751 [4:02:12<25:29:07, 60.68s/it]                                                                                                                                     {'loss': '0.5269', 'grad_norm': '0.2109', 'learning_rate': '1.96e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '76.11', 'memory/max_allocated (GiB)': '76.11', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '113.1', 'tokens/total': 453407456, 'tokens/trainable': 167721392, 'epoch': '0.4095'}
 14%|████████████                                                                            | 239/1751 [4:02:12<25:29:07, 60.68s/it] 14%|████████████                                                                            | 240/1751 [4:03:11<25:17:13, 60.25s/it]                                                                                                                                     {'loss': '0.5516', 'grad_norm': '0.2207', 'learning_rate': '1.959e-05', 'ppl': '1.736', 'memory/max_active (GiB)': '73.2', 'memory/max_allocated (GiB)': '73.2', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '63.36', 'tokens/total': 455300352, 'tokens/trainable': 168403360, 'epoch': '0.4113'}
 14%|████████████                                                                            | 240/1751 [4:03:11<25:17:13, 60.25s/it] 14%|████████████                                                                            | 241/1751 [4:04:12<25:23:52, 60.55s/it]                                                                                                                                     {'loss': '0.5163', 'grad_norm': '0.1924', 'learning_rate': '1.959e-05', 'ppl': '1.676', 'memory/max_active (GiB)': '71.65', 'memory/max_allocated (GiB)': '71.65', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '36.94', 'tokens/total': 457251744, 'tokens/trainable': 169144128, 'epoch': '0.413'}
 14%|████████████                                                                            | 241/1751 [4:04:12<25:23:52, 60.55s/it] 14%|████████████▏                                                                           | 242/1751 [4:05:12<25:18:12, 60.37s/it]                                                                                                                                     {'loss': '0.5762', 'grad_norm': '0.2334', 'learning_rate': '1.958e-05', 'ppl': '1.779', 'memory/max_active (GiB)': '73.48', 'memory/max_allocated (GiB)': '73.48', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '89.64', 'tokens/total': 459136864, 'tokens/trainable': 169855200, 'epoch': '0.4147'}
 14%|████████████▏                                                                           | 242/1751 [4:05:12<25:18:12, 60.37s/it] 14%|████████████▏                                                                           | 243/1751 [4:06:12<25:10:30, 60.10s/it]                                                                                                                                     {'loss': '0.5096', 'grad_norm': '0.2246', 'learning_rate': '1.957e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '70.1', 'memory/max_allocated (GiB)': '70.1', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '119.5', 'tokens/total': 461038688, 'tokens/trainable': 170527664, 'epoch': '0.4164'}
 14%|████████████▏                                                                           | 243/1751 [4:06:12<25:10:30, 60.10s/it] 14%|████████████▎                                                                           | 244/1751 [4:07:12<25:14:39, 60.31s/it]                                                                                                                                     {'loss': '0.5375', 'grad_norm': '0.2109', 'learning_rate': '1.957e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '76.16', 'memory/max_allocated (GiB)': '76.16', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '142.8', 'tokens/total': 462970112, 'tokens/trainable': 171255744, 'epoch': '0.4181'}
 14%|████████████▎                                                                           | 244/1751 [4:07:12<25:14:39, 60.31s/it] 14%|████████████▎                                                                           | 245/1751 [4:08:15<25:28:34, 60.90s/it]                                                                                                                                     {'loss': '0.5441', 'grad_norm': '0.2354', 'learning_rate': '1.956e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '74.8', 'memory/max_allocated (GiB)': '74.8', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '201.6', 'tokens/total': 465016352, 'tokens/trainable': 172002368, 'epoch': '0.4198'}
 14%|████████████▎                                                                           | 245/1751 [4:08:15<25:28:34, 60.90s/it] 14%|████████████▎                                                                           | 246/1751 [4:09:13<25:06:52, 60.08s/it]                                                                                                                                     {'loss': '0.5532', 'grad_norm': '0.2227', 'learning_rate': '1.956e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '76.15', 'memory/max_allocated (GiB)': '76.15', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '74.37', 'tokens/total': 466832032, 'tokens/trainable': 172672656, 'epoch': '0.4215'}
 14%|████████████▎                                                                           | 246/1751 [4:09:13<25:06:52, 60.08s/it] 14%|████████████▍                                                                           | 247/1751 [4:10:10<24:40:23, 59.06s/it]                                                                                                                                     {'loss': '0.5787', 'grad_norm': '0.2148', 'learning_rate': '1.955e-05', 'ppl': '1.784', 'memory/max_active (GiB)': '75.79', 'memory/max_allocated (GiB)': '75.79', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '87.21', 'tokens/total': 468655040, 'tokens/trainable': 173338784, 'epoch': '0.4233'}
 14%|████████████▍                                                                           | 247/1751 [4:10:10<24:40:23, 59.06s/it] 14%|████████████▍                                                                           | 248/1751 [4:11:08<24:36:39, 58.95s/it]                                                                                                                                     {'loss': '0.5406', 'grad_norm': '0.2061', 'learning_rate': '1.955e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '72.62', 'memory/max_allocated (GiB)': '72.62', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '113.6', 'tokens/total': 470542208, 'tokens/trainable': 174032192, 'epoch': '0.425'}
 14%|████████████▍                                                                           | 248/1751 [4:11:08<24:36:39, 58.95s/it] 14%|████████████▌                                                                           | 249/1751 [4:12:06<24:23:16, 58.45s/it]                                                                                                                                     {'loss': '0.5542', 'grad_norm': '0.2246', 'learning_rate': '1.954e-05', 'ppl': '1.741', 'memory/max_active (GiB)': '77.54', 'memory/max_allocated (GiB)': '77.54', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '102.4', 'tokens/total': 472375776, 'tokens/trainable': 174706816, 'epoch': '0.4267'}
 14%|████████████▌                                                                           | 249/1751 [4:12:06<24:23:16, 58.45s/it] 14%|████████████▌                                                                           | 250/1751 [4:13:06<24:40:34, 59.18s/it]                                                                                                                                     {'loss': '0.541', 'grad_norm': '0.2061', 'learning_rate': '1.954e-05', 'ppl': '1.718', 'memory/max_active (GiB)': '74.46', 'memory/max_allocated (GiB)': '74.46', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '95.59', 'tokens/total': 474365952, 'tokens/trainable': 175411456, 'epoch': '0.4284'}
 14%|████████████▌                                                                           | 250/1751 [4:13:06<24:40:34, 59.18s/it] 14%|████████████▌                                                                           | 251/1751 [4:14:06<24:40:32, 59.22s/it]                                                                                                                                     {'loss': '0.5611', 'grad_norm': '0.208', 'learning_rate': '1.953e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '72.54', 'memory/max_allocated (GiB)': '72.54', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '28.73', 'tokens/total': 476220992, 'tokens/trainable': 176113392, 'epoch': '0.4301'}
 14%|████████████▌                                                                           | 251/1751 [4:14:06<24:40:32, 59.22s/it] 14%|████████████▋                                                                           | 252/1751 [4:15:06<24:46:04, 59.48s/it]                                                                                                                                     {'loss': '0.5307', 'grad_norm': '0.1885', 'learning_rate': '1.952e-05', 'ppl': '1.7', 'memory/max_active (GiB)': '76.25', 'memory/max_allocated (GiB)': '76.25', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '65.61', 'tokens/total': 478132416, 'tokens/trainable': 176809168, 'epoch': '0.4318'}
 14%|████████████▋                                                                           | 252/1751 [4:15:06<24:46:04, 59.48s/it] 14%|████████████▋                                                                           | 253/1751 [4:16:06<24:50:59, 59.72s/it]                                                                                                                                     {'loss': '0.5442', 'grad_norm': '0.2197', 'learning_rate': '1.952e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '73.86', 'memory/max_allocated (GiB)': '73.86', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '126.4', 'tokens/total': 480072224, 'tokens/trainable': 177496832, 'epoch': '0.4335'}
 14%|████████████▋                                                                           | 253/1751 [4:16:06<24:50:59, 59.72s/it] 15%|████████████▊                                                                           | 254/1751 [4:17:05<24:40:20, 59.33s/it]                                                                                                                                     {'loss': '0.6008', 'grad_norm': '0.2256', 'learning_rate': '1.951e-05', 'ppl': '1.824', 'memory/max_active (GiB)': '77.57', 'memory/max_allocated (GiB)': '77.57', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '68.18', 'tokens/total': 481881792, 'tokens/trainable': 178134528, 'epoch': '0.4353'}
 15%|████████████▊                                                                           | 254/1751 [4:17:05<24:40:20, 59.33s/it] 15%|████████████▊                                                                           | 255/1751 [4:18:04<24:41:17, 59.41s/it]                                                                                                                                     {'loss': '0.4974', 'grad_norm': '0.1982', 'learning_rate': '1.951e-05', 'ppl': '1.644', 'memory/max_active (GiB)': '74.92', 'memory/max_allocated (GiB)': '74.92', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '108.6', 'tokens/total': 483769472, 'tokens/trainable': 178823424, 'epoch': '0.437'}
 15%|████████████▊                                                                           | 255/1751 [4:18:04<24:41:17, 59.41s/it] 15%|████████████▊                                                                           | 256/1751 [4:19:05<24:51:37, 59.86s/it]                                                                                                                                     {'loss': '0.5623', 'grad_norm': '0.208', 'learning_rate': '1.95e-05', 'ppl': '1.755', 'memory/max_active (GiB)': '75.18', 'memory/max_allocated (GiB)': '75.18', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '80.25', 'tokens/total': 485686080, 'tokens/trainable': 179556000, 'epoch': '0.4387'}
 15%|████████████▊                                                                           | 256/1751 [4:19:05<24:51:37, 59.86s/it] 15%|████████████▉                                                                           | 257/1751 [4:20:04<24:44:47, 59.63s/it]                                                                                                                                     {'loss': '0.5419', 'grad_norm': '0.2031', 'learning_rate': '1.95e-05', 'ppl': '1.719', 'memory/max_active (GiB)': '71.85', 'memory/max_allocated (GiB)': '71.85', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '92.34', 'tokens/total': 487580224, 'tokens/trainable': 180277088, 'epoch': '0.4404'}
 15%|████████████▉                                                                           | 257/1751 [4:20:04<24:44:47, 59.63s/it] 15%|████████████▉                                                                           | 258/1751 [4:21:03<24:38:42, 59.43s/it]                                                                                                                                     {'loss': '0.5606', 'grad_norm': '0.2236', 'learning_rate': '1.949e-05', 'ppl': '1.752', 'memory/max_active (GiB)': '75.31', 'memory/max_allocated (GiB)': '75.31', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '95.93', 'tokens/total': 489433056, 'tokens/trainable': 180964608, 'epoch': '0.4421'}
 15%|████████████▉                                                                           | 258/1751 [4:21:03<24:38:42, 59.43s/it] 15%|█████████████                                                                           | 259/1751 [4:22:01<24:23:54, 58.87s/it]                                                                                                                                     {'loss': '0.5446', 'grad_norm': '0.2109', 'learning_rate': '1.948e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '73.57', 'memory/max_allocated (GiB)': '73.57', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '92.23', 'tokens/total': 491229568, 'tokens/trainable': 181607744, 'epoch': '0.4438'}
 15%|█████████████                                                                           | 259/1751 [4:22:01<24:23:54, 58.87s/it] 15%|█████████████                                                                           | 260/1751 [4:23:00<24:29:27, 59.13s/it]                                                                                                                                     {'loss': '0.5721', 'grad_norm': '0.2041', 'learning_rate': '1.948e-05', 'ppl': '1.772', 'memory/max_active (GiB)': '73.93', 'memory/max_allocated (GiB)': '73.93', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '81.46', 'tokens/total': 493101952, 'tokens/trainable': 182316368, 'epoch': '0.4455'}
 15%|█████████████                                                                           | 260/1751 [4:23:00<24:29:27, 59.13s/it] 15%|█████████████                                                                           | 261/1751 [4:24:01<24:37:29, 59.50s/it]                                                                                                                                     {'loss': '0.53', 'grad_norm': '0.2051', 'learning_rate': '1.947e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '74.8', 'memory/max_allocated (GiB)': '74.8', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '249.9', 'tokens/total': 494998528, 'tokens/trainable': 183040640, 'epoch': '0.4472'}
 15%|█████████████                                                                           | 261/1751 [4:24:01<24:37:29, 59.50s/it] 15%|█████████████▏                                                                          | 262/1751 [4:25:01<24:42:42, 59.75s/it]                                                                                                                                     {'loss': '0.5562', 'grad_norm': '0.209', 'learning_rate': '1.947e-05', 'ppl': '1.744', 'memory/max_active (GiB)': '76.89', 'memory/max_allocated (GiB)': '76.89', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '82.35', 'tokens/total': 496922400, 'tokens/trainable': 183746464, 'epoch': '0.449'}
 15%|█████████████▏                                                                          | 262/1751 [4:25:01<24:42:42, 59.75s/it] 15%|█████████████▏                                                                          | 263/1751 [4:26:04<25:05:41, 60.71s/it]                                                                                                                                     {'loss': '0.5261', 'grad_norm': '0.21', 'learning_rate': '1.946e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '74.9', 'memory/max_allocated (GiB)': '74.9', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '55.99', 'tokens/total': 498971040, 'tokens/trainable': 184524032, 'epoch': '0.4507'}
 15%|█████████████▏                                                                          | 263/1751 [4:26:04<25:05:41, 60.71s/it] 15%|█████████████▎                                                                          | 264/1751 [4:27:04<24:57:34, 60.43s/it]                                                                                                                                     {'loss': '0.5381', 'grad_norm': '0.2168', 'learning_rate': '1.945e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '74.1', 'memory/max_allocated (GiB)': '74.1', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '49.38', 'tokens/total': 500862272, 'tokens/trainable': 185226352, 'epoch': '0.4524'}
 15%|█████████████▎                                                                          | 264/1751 [4:27:04<24:57:34, 60.43s/it] 15%|█████████████▎                                                                          | 265/1751 [4:28:02<24:43:04, 59.88s/it]                                                                                                                                     {'loss': '0.5402', 'grad_norm': '0.2285', 'learning_rate': '1.945e-05', 'ppl': '1.716', 'memory/max_active (GiB)': '76.63', 'memory/max_allocated (GiB)': '76.63', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '36.94', 'tokens/total': 502660288, 'tokens/trainable': 185869888, 'epoch': '0.4541'}
 15%|█████████████▎                                                                          | 265/1751 [4:28:02<24:43:04, 59.88s/it] 15%|█████████████▎                                                                          | 266/1751 [4:29:03<24:45:59, 60.04s/it]                                                                                                                                     {'loss': '0.5661', 'grad_norm': '0.2217', 'learning_rate': '1.944e-05', 'ppl': '1.761', 'memory/max_active (GiB)': '74.94', 'memory/max_allocated (GiB)': '74.94', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '41.39', 'tokens/total': 504581984, 'tokens/trainable': 186588352, 'epoch': '0.4558'}
 15%|█████████████▎                                                                          | 266/1751 [4:29:03<24:45:59, 60.04s/it] 15%|█████████████▍                                                                          | 267/1751 [4:30:04<24:53:00, 60.36s/it]                                                                                                                                     {'loss': '0.577', 'grad_norm': '0.2266', 'learning_rate': '1.943e-05', 'ppl': '1.781', 'memory/max_active (GiB)': '75.28', 'memory/max_allocated (GiB)': '75.28', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '31.09', 'tokens/total': 506488768, 'tokens/trainable': 187295200, 'epoch': '0.4575'}
 15%|█████████████▍                                                                          | 267/1751 [4:30:04<24:53:00, 60.36s/it] 15%|█████████████▍                                                                          | 268/1751 [4:31:05<24:59:02, 60.65s/it]                                                                                                                                     {'loss': '0.5359', 'grad_norm': '0.2432', 'learning_rate': '1.943e-05', 'ppl': '1.709', 'memory/max_active (GiB)': '74.36', 'memory/max_allocated (GiB)': '74.36', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '43.24', 'tokens/total': 508449504, 'tokens/trainable': 188014816, 'epoch': '0.4592'}
 15%|█████████████▍                                                                          | 268/1751 [4:31:05<24:59:02, 60.65s/it] 15%|█████████████▌                                                                          | 269/1751 [4:32:06<24:56:35, 60.59s/it]                                                                                                                                     {'loss': '0.5431', 'grad_norm': '0.2158', 'learning_rate': '1.942e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '75.47', 'memory/max_allocated (GiB)': '75.47', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '59.71', 'tokens/total': 510359328, 'tokens/trainable': 188701872, 'epoch': '0.461'}
 15%|█████████████▌                                                                          | 269/1751 [4:32:06<24:56:35, 60.59s/it] 15%|█████████████▌                                                                          | 270/1751 [4:33:03<24:28:54, 59.51s/it]                                                                                                                                     {'loss': '0.5866', 'grad_norm': '0.2295', 'learning_rate': '1.942e-05', 'ppl': '1.798', 'memory/max_active (GiB)': '69.57', 'memory/max_allocated (GiB)': '69.57', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '147.6', 'tokens/total': 512128352, 'tokens/trainable': 189360784, 'epoch': '0.4627'}
 15%|█████████████▌                                                                          | 270/1751 [4:33:03<24:28:54, 59.51s/it] 15%|█████████████▌                                                                          | 271/1751 [4:34:02<24:25:17, 59.40s/it]                                                                                                                                     {'loss': '0.5753', 'grad_norm': '0.2383', 'learning_rate': '1.941e-05', 'ppl': '1.778', 'memory/max_active (GiB)': '75.87', 'memory/max_allocated (GiB)': '75.87', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '21.84', 'tokens/total': 514031424, 'tokens/trainable': 190050384, 'epoch': '0.4644'}
 15%|█████████████▌                                                                          | 271/1751 [4:34:02<24:25:17, 59.40s/it] 16%|█████████████▋                                                                          | 272/1751 [4:34:59<24:07:35, 58.73s/it]                                                                                                                                     {'loss': '0.5949', 'grad_norm': '0.2148', 'learning_rate': '1.94e-05', 'ppl': '1.813', 'memory/max_active (GiB)': '70.96', 'memory/max_allocated (GiB)': '70.96', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '99.77', 'tokens/total': 515777184, 'tokens/trainable': 190694096, 'epoch': '0.4661'}
 16%|█████████████▋                                                                          | 272/1751 [4:34:59<24:07:35, 58.73s/it] 16%|█████████████▋                                                                          | 273/1751 [4:35:58<24:11:21, 58.92s/it]                                                                                                                                     {'loss': '0.5665', 'grad_norm': '0.2217', 'learning_rate': '1.94e-05', 'ppl': '1.762', 'memory/max_active (GiB)': '71.55', 'memory/max_allocated (GiB)': '71.55', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '138.5', 'tokens/total': 517653184, 'tokens/trainable': 191367216, 'epoch': '0.4678'}
 16%|█████████████▋                                                                          | 273/1751 [4:35:58<24:11:21, 58.92s/it] 16%|█████████████▊                                                                          | 274/1751 [4:37:02<24:45:10, 60.33s/it]                                                                                                                                     {'loss': '0.4805', 'grad_norm': '0.1875', 'learning_rate': '1.939e-05', 'ppl': '1.617', 'memory/max_active (GiB)': '77.77', 'memory/max_allocated (GiB)': '77.77', 'memory/device_reserved (GiB)': '95.88', 'tokens/train_per_sec_per_gpu': '154.6', 'tokens/total': 519702848, 'tokens/trainable': 192105104, 'epoch': '0.4695'}
 16%|█████████████▊                                                                          | 274/1751 [4:37:02<24:45:10, 60.33s/it] 16%|█████████████▊                                                                          | 275/1751 [4:38:02<24:45:27, 60.38s/it]                                                                                                                                     {'loss': '0.4961', 'grad_norm': '0.208', 'learning_rate': '1.938e-05', 'ppl': '1.642', 'memory/max_active (GiB)': '76.23', 'memory/max_allocated (GiB)': '76.23', 'memory/device_reserved (GiB)': '97.62', 'tokens/train_per_sec_per_gpu': '48.46', 'tokens/total': 521642080, 'tokens/trainable': 192826992, 'epoch': '0.4712'}
 16%|█████████████▊                                                                          | 275/1751 [4:38:02<24:45:27, 60.38s/it] 16%|█████████████▊                                                                          | 276/1751 [4:39:06<25:10:10, 61.43s/it]                                                                                                                                     {'loss': '0.522', 'grad_norm': '0.208', 'learning_rate': '1.938e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '74.95', 'memory/max_allocated (GiB)': '74.95', 'memory/device_reserved (GiB)': '97.62', 'tokens/train_per_sec_per_gpu': '77.66', 'tokens/total': 523724640, 'tokens/trainable': 193607712, 'epoch': '0.473'}
 16%|█████████████▊                                                                          | 276/1751 [4:39:06<25:10:10, 61.43s/it] 16%|█████████████▉                                                                          | 277/1751 [4:40:06<24:54:54, 60.85s/it]                                                                                                                                     {'loss': '0.5739', 'grad_norm': '0.2109', 'learning_rate': '1.937e-05', 'ppl': '1.775', 'memory/max_active (GiB)': '74.08', 'memory/max_allocated (GiB)': '74.08', 'memory/device_reserved (GiB)': '97.62', 'tokens/train_per_sec_per_gpu': '109.3', 'tokens/total': 525561952, 'tokens/trainable': 194307680, 'epoch': '0.4747'}
 16%|█████████████▉                                                                          | 277/1751 [4:40:06<24:54:54, 60.85s/it] 16%|█████████████▉                                                                          | 278/1751 [4:41:04<24:31:26, 59.94s/it]                                                                                                                                     {'loss': '0.5624', 'grad_norm': '0.249', 'learning_rate': '1.936e-05', 'ppl': '1.755', 'memory/max_active (GiB)': '75.15', 'memory/max_allocated (GiB)': '75.15', 'memory/device_reserved (GiB)': '97.62', 'tokens/train_per_sec_per_gpu': '98.53', 'tokens/total': 527374176, 'tokens/trainable': 194950672, 'epoch': '0.4764'}
 16%|█████████████▉                                                                          | 278/1751 [4:41:04<24:31:26, 59.94s/it] 16%|██████████████                                                                          | 279/1751 [4:42:03<24:26:39, 59.78s/it]                                                                                                                                     {'loss': '0.5475', 'grad_norm': '0.2217', 'learning_rate': '1.936e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '73.38', 'memory/max_allocated (GiB)': '73.38', 'memory/device_reserved (GiB)': '97.62', 'tokens/train_per_sec_per_gpu': '94.15', 'tokens/total': 529247712, 'tokens/trainable': 195614704, 'epoch': '0.4781'}
 16%|██████████████                                                                          | 279/1751 [4:42:03<24:26:39, 59.78s/it] 16%|██████████████                                                                          | 280/1751 [4:43:01<24:14:08, 59.31s/it]                                                                                                                                     {'loss': '0.5509', 'grad_norm': '0.2158', 'learning_rate': '1.935e-05', 'ppl': '1.735', 'memory/max_active (GiB)': '73.65', 'memory/max_allocated (GiB)': '73.65', 'memory/device_reserved (GiB)': '97.62', 'tokens/train_per_sec_per_gpu': '59.54', 'tokens/total': 531063424, 'tokens/trainable': 196292064, 'epoch': '0.4798'}
 16%|██████████████                                                                          | 280/1751 [4:43:01<24:14:08, 59.31s/it] 16%|██████████████                                                                          | 281/1751 [4:44:04<24:36:04, 60.25s/it]                                                                                                                                     {'loss': '0.5395', 'grad_norm': '0.1963', 'learning_rate': '1.934e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '74.73', 'memory/max_allocated (GiB)': '74.73', 'memory/device_reserved (GiB)': '97.62', 'tokens/train_per_sec_per_gpu': '46.77', 'tokens/total': 533064256, 'tokens/trainable': 197042720, 'epoch': '0.4815'}
 16%|██████████████                                                                          | 281/1751 [4:44:04<24:36:04, 60.25s/it] 16%|██████████████▏                                                                         | 282/1751 [4:45:03<24:26:37, 59.90s/it]                                                                                                                                     {'loss': '0.5447', 'grad_norm': '0.2236', 'learning_rate': '1.934e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '71.86', 'memory/max_allocated (GiB)': '71.86', 'memory/device_reserved (GiB)': '97.62', 'tokens/train_per_sec_per_gpu': '141.7', 'tokens/total': 534930592, 'tokens/trainable': 197718160, 'epoch': '0.4832'}
 16%|██████████████▏                                                                         | 282/1751 [4:45:03<24:26:37, 59.90s/it] 16%|██████████████▏                                                                         | 283/1751 [4:46:04<24:38:00, 60.41s/it]                                                                                                                                     {'loss': '0.5296', 'grad_norm': '0.209', 'learning_rate': '1.933e-05', 'ppl': '1.698', 'memory/max_active (GiB)': '72.5', 'memory/max_allocated (GiB)': '72.5', 'memory/device_reserved (GiB)': '97.62', 'tokens/train_per_sec_per_gpu': '104.6', 'tokens/total': 536910144, 'tokens/trainable': 198430624, 'epoch': '0.4849'}
 16%|██████████████▏                                                                         | 283/1751 [4:46:04<24:38:00, 60.41s/it] 16%|██████████████▎                                                                         | 284/1751 [4:47:02<24:15:40, 59.54s/it]                                                                                                                                     {'loss': '0.5225', 'grad_norm': '0.2305', 'learning_rate': '1.932e-05', 'ppl': '1.686', 'memory/max_active (GiB)': '75.53', 'memory/max_allocated (GiB)': '75.53', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '71.87', 'tokens/total': 538724288, 'tokens/trainable': 199078064, 'epoch': '0.4867'}
 16%|██████████████▎                                                                         | 284/1751 [4:47:02<24:15:40, 59.54s/it] 16%|██████████████▎                                                                         | 285/1751 [4:48:02<24:16:47, 59.62s/it]                                                                                                                                     {'loss': '0.5801', 'grad_norm': '0.2295', 'learning_rate': '1.932e-05', 'ppl': '1.786', 'memory/max_active (GiB)': '75.72', 'memory/max_allocated (GiB)': '75.72', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '55.89', 'tokens/total': 540594816, 'tokens/trainable': 199806080, 'epoch': '0.4884'}
 16%|██████████████▎                                                                         | 285/1751 [4:48:02<24:16:47, 59.62s/it] 16%|██████████████▎                                                                         | 286/1751 [4:49:04<24:35:40, 60.44s/it]                                                                                                                                     {'loss': '0.5449', 'grad_norm': '0.2051', 'learning_rate': '1.931e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '74.49', 'memory/max_allocated (GiB)': '74.49', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '167.2', 'tokens/total': 542612544, 'tokens/trainable': 200552832, 'epoch': '0.4901'}
 16%|██████████████▎                                                                         | 286/1751 [4:49:04<24:35:40, 60.44s/it] 16%|██████████████▍                                                                         | 287/1751 [4:50:03<24:25:28, 60.06s/it]                                                                                                                                     {'loss': '0.5432', 'grad_norm': '0.2109', 'learning_rate': '1.93e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '76.48', 'memory/max_allocated (GiB)': '76.48', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '150.4', 'tokens/total': 544513152, 'tokens/trainable': 201252432, 'epoch': '0.4918'}
 16%|██████████████▍                                                                         | 287/1751 [4:50:03<24:25:28, 60.06s/it] 16%|██████████████▍                                                                         | 288/1751 [4:51:03<24:20:18, 59.89s/it]                                                                                                                                     {'loss': '0.5388', 'grad_norm': '0.1934', 'learning_rate': '1.93e-05', 'ppl': '1.714', 'memory/max_active (GiB)': '70.64', 'memory/max_allocated (GiB)': '70.64', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '107.9', 'tokens/total': 546436224, 'tokens/trainable': 201960560, 'epoch': '0.4935'}
 16%|██████████████▍                                                                         | 288/1751 [4:51:03<24:20:18, 59.89s/it] 17%|██████████████▌                                                                         | 289/1751 [4:52:01<24:05:58, 59.34s/it]                                                                                                                                     {'loss': '0.5417', 'grad_norm': '0.208', 'learning_rate': '1.929e-05', 'ppl': '1.719', 'memory/max_active (GiB)': '75.01', 'memory/max_allocated (GiB)': '75.01', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '28.75', 'tokens/total': 548293248, 'tokens/trainable': 202633104, 'epoch': '0.4952'}
 17%|██████████████▌                                                                         | 289/1751 [4:52:01<24:05:58, 59.34s/it] 17%|██████████████▌                                                                         | 290/1751 [4:53:01<24:09:09, 59.51s/it]                                                                                                                                     {'loss': '0.5046', 'grad_norm': '0.1914', 'learning_rate': '1.928e-05', 'ppl': '1.656', 'memory/max_active (GiB)': '75.62', 'memory/max_allocated (GiB)': '75.62', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '29.28', 'tokens/total': 550195008, 'tokens/trainable': 203315344, 'epoch': '0.4969'}
 17%|██████████████▌                                                                         | 290/1751 [4:53:01<24:09:09, 59.51s/it] 17%|██████████████▌                                                                         | 291/1751 [4:54:02<24:22:36, 60.11s/it]                                                                                                                                     {'loss': '0.5379', 'grad_norm': '0.2061', 'learning_rate': '1.927e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '75.18', 'memory/max_allocated (GiB)': '75.18', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '181', 'tokens/total': 552189440, 'tokens/trainable': 204048288, 'epoch': '0.4987'}
 17%|██████████████▌                                                                         | 291/1751 [4:54:02<24:22:36, 60.11s/it] 17%|██████████████▋                                                                         | 292/1751 [4:55:00<24:05:12, 59.43s/it]                                                                                                                                     {'loss': '0.5902', 'grad_norm': '0.209', 'learning_rate': '1.927e-05', 'ppl': '1.804', 'memory/max_active (GiB)': '69.56', 'memory/max_allocated (GiB)': '69.56', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '190.3', 'tokens/total': 554037632, 'tokens/trainable': 204719280, 'epoch': '0.5004'}
 17%|██████████████▋                                                                         | 292/1751 [4:55:00<24:05:12, 59.43s/it] 17%|██████████████▋                                                                         | 293/1751 [4:56:00<24:10:13, 59.68s/it]                                                                                                                                     {'loss': '0.5649', 'grad_norm': '0.2041', 'learning_rate': '1.926e-05', 'ppl': '1.759', 'memory/max_active (GiB)': '69.61', 'memory/max_allocated (GiB)': '69.61', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '68.7', 'tokens/total': 555924608, 'tokens/trainable': 205439568, 'epoch': '0.5021'}
 17%|██████████████▋                                                                         | 293/1751 [4:56:00<24:10:13, 59.68s/it] 17%|██████████████▊                                                                         | 294/1751 [4:57:00<24:07:57, 59.63s/it]                                                                                                                                     {'loss': '0.5448', 'grad_norm': '0.2061', 'learning_rate': '1.925e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '74.61', 'memory/max_allocated (GiB)': '74.61', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '38.55', 'tokens/total': 557795648, 'tokens/trainable': 206140656, 'epoch': '0.5038'}
 17%|██████████████▊                                                                         | 294/1751 [4:57:00<24:07:57, 59.63s/it] 17%|██████████████▊                                                                         | 295/1751 [4:57:58<23:59:22, 59.32s/it]                                                                                                                                     {'loss': '0.543', 'grad_norm': '0.207', 'learning_rate': '1.925e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '72.57', 'memory/max_allocated (GiB)': '72.57', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '91.29', 'tokens/total': 559649216, 'tokens/trainable': 206822592, 'epoch': '0.5055'}
 17%|██████████████▊                                                                         | 295/1751 [4:57:58<23:59:22, 59.32s/it] 17%|██████████████▉                                                                         | 296/1751 [4:58:58<24:03:43, 59.54s/it]                                                                                                                                     {'loss': '0.5911', 'grad_norm': '0.2168', 'learning_rate': '1.924e-05', 'ppl': '1.806', 'memory/max_active (GiB)': '76.12', 'memory/max_allocated (GiB)': '76.12', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '42.78', 'tokens/total': 561559872, 'tokens/trainable': 207542240, 'epoch': '0.5072'}
 17%|██████████████▉                                                                         | 296/1751 [4:58:58<24:03:43, 59.54s/it] 17%|██████████████▉                                                                         | 297/1751 [4:59:58<24:03:26, 59.56s/it]                                                                                                                                     {'loss': '0.5558', 'grad_norm': '0.209', 'learning_rate': '1.923e-05', 'ppl': '1.743', 'memory/max_active (GiB)': '72.59', 'memory/max_allocated (GiB)': '72.59', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '42.09', 'tokens/total': 563418432, 'tokens/trainable': 208227840, 'epoch': '0.5089'}
 17%|██████████████▉                                                                         | 297/1751 [4:59:58<24:03:26, 59.56s/it] 17%|██████████████▉                                                                         | 298/1751 [5:00:57<23:56:43, 59.33s/it]                                                                                                                                     {'loss': '0.5702', 'grad_norm': '0.2334', 'learning_rate': '1.922e-05', 'ppl': '1.769', 'memory/max_active (GiB)': '68.22', 'memory/max_allocated (GiB)': '68.22', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '75.02', 'tokens/total': 565275968, 'tokens/trainable': 208905616, 'epoch': '0.5106'}
 17%|██████████████▉                                                                         | 298/1751 [5:00:57<23:56:43, 59.33s/it] 17%|███████████████                                                                         | 299/1751 [5:01:59<24:12:48, 60.03s/it]                                                                                                                                     {'loss': '0.4966', 'grad_norm': '0.2119', 'learning_rate': '1.922e-05', 'ppl': '1.643', 'memory/max_active (GiB)': '76.96', 'memory/max_allocated (GiB)': '76.96', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '96.24', 'tokens/total': 567269056, 'tokens/trainable': 209638880, 'epoch': '0.5124'}
 17%|███████████████                                                                         | 299/1751 [5:01:59<24:12:48, 60.03s/it] 17%|███████████████                                                                         | 300/1751 [5:02:55<23:48:29, 59.07s/it]                                                                                                                                     {'loss': '0.586', 'grad_norm': '0.2178', 'learning_rate': '1.921e-05', 'ppl': '1.797', 'memory/max_active (GiB)': '76.88', 'memory/max_allocated (GiB)': '76.88', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '59.77', 'tokens/total': 569066816, 'tokens/trainable': 210277552, 'epoch': '0.5141'}
 17%|███████████████                                                                         | 300/1751 [5:02:55<23:48:29, 59.07s/it] 17%|███████████████▏                                                                        | 301/1751 [5:03:54<23:46:21, 59.02s/it]                                                                                                                                     {'loss': '0.569', 'grad_norm': '0.2207', 'learning_rate': '1.92e-05', 'ppl': '1.767', 'memory/max_active (GiB)': '77.06', 'memory/max_allocated (GiB)': '77.06', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '65.53', 'tokens/total': 570936128, 'tokens/trainable': 210966304, 'epoch': '0.5158'}
 17%|███████████████▏                                                                        | 301/1751 [5:03:54<23:46:21, 59.02s/it] 17%|███████████████▏                                                                        | 302/1751 [5:04:55<23:59:39, 59.61s/it]                                                                                                                                     {'loss': '0.5285', 'grad_norm': '0.252', 'learning_rate': '1.919e-05', 'ppl': '1.696', 'memory/max_active (GiB)': '74.7', 'memory/max_allocated (GiB)': '74.7', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '110.8', 'tokens/total': 572833472, 'tokens/trainable': 211690800, 'epoch': '0.5175'}
 17%|███████████████▏                                                                        | 302/1751 [5:04:55<23:59:39, 59.61s/it] 17%|███████████████▏                                                                        | 303/1751 [5:05:55<24:01:33, 59.73s/it]                                                                                                                                     {'loss': '0.5596', 'grad_norm': '0.2188', 'learning_rate': '1.919e-05', 'ppl': '1.75', 'memory/max_active (GiB)': '72.84', 'memory/max_allocated (GiB)': '72.84', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '53.19', 'tokens/total': 574715136, 'tokens/trainable': 212387056, 'epoch': '0.5192'}
 17%|███████████████▏                                                                        | 303/1751 [5:05:55<24:01:33, 59.73s/it] 17%|███████████████▎                                                                        | 304/1751 [5:06:56<24:05:35, 59.94s/it]                                                                                                                                     {'loss': '0.5271', 'grad_norm': '0.2109', 'learning_rate': '1.918e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '75.86', 'memory/max_allocated (GiB)': '75.86', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '69.28', 'tokens/total': 576627776, 'tokens/trainable': 213097696, 'epoch': '0.5209'}
 17%|███████████████▎                                                                        | 304/1751 [5:06:56<24:05:35, 59.94s/it] 17%|███████████████▎                                                                        | 305/1751 [5:07:53<23:41:40, 58.99s/it]                                                                                                                                     {'loss': '0.5421', 'grad_norm': '0.2227', 'learning_rate': '1.917e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '77.7', 'memory/max_allocated (GiB)': '77.7', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '163.3', 'tokens/total': 578392320, 'tokens/trainable': 213749856, 'epoch': '0.5226'}
 17%|███████████████▎                                                                        | 305/1751 [5:07:53<23:41:40, 58.99s/it] 17%|███████████████▍                                                                        | 306/1751 [5:08:54<23:58:29, 59.73s/it]                                                                                                                                     {'loss': '0.5465', 'grad_norm': '0.1914', 'learning_rate': '1.916e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '72.33', 'memory/max_allocated (GiB)': '72.33', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '43.54', 'tokens/total': 580369152, 'tokens/trainable': 214491168, 'epoch': '0.5244'}
 17%|███████████████▍                                                                        | 306/1751 [5:08:54<23:58:29, 59.73s/it] 18%|███████████████▍                                                                        | 307/1751 [5:09:51<23:41:03, 59.05s/it]                                                                                                                                     {'loss': '0.6025', 'grad_norm': '0.2793', 'learning_rate': '1.916e-05', 'ppl': '1.827', 'memory/max_active (GiB)': '71.57', 'memory/max_allocated (GiB)': '71.57', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '89.64', 'tokens/total': 582177728, 'tokens/trainable': 215117808, 'epoch': '0.5261'}
 18%|███████████████▍                                                                        | 307/1751 [5:09:51<23:41:03, 59.05s/it] 18%|███████████████▍                                                                        | 308/1751 [5:10:50<23:33:58, 58.79s/it]                                                                                                                                     {'loss': '0.5191', 'grad_norm': '0.2295', 'learning_rate': '1.915e-05', 'ppl': '1.681', 'memory/max_active (GiB)': '73.35', 'memory/max_allocated (GiB)': '73.35', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '118.2', 'tokens/total': 584004224, 'tokens/trainable': 215798480, 'epoch': '0.5278'}
 18%|███████████████▍                                                                        | 308/1751 [5:10:50<23:33:58, 58.79s/it] 18%|███████████████▌                                                                        | 309/1751 [5:11:51<23:51:58, 59.58s/it]                                                                                                                                     {'loss': '0.5126', 'grad_norm': '0.207', 'learning_rate': '1.914e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '76.89', 'memory/max_allocated (GiB)': '76.89', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '135.7', 'tokens/total': 585986112, 'tokens/trainable': 216505696, 'epoch': '0.5295'}
 18%|███████████████▌                                                                        | 309/1751 [5:11:51<23:51:58, 59.58s/it] 18%|███████████████▌                                                                        | 310/1751 [5:12:49<23:41:27, 59.19s/it]                                                                                                                                     {'loss': '0.5132', 'grad_norm': '0.2314', 'learning_rate': '1.913e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '74.15', 'memory/max_allocated (GiB)': '74.15', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '115.8', 'tokens/total': 587810624, 'tokens/trainable': 217181920, 'epoch': '0.5312'}
 18%|███████████████▌                                                                        | 310/1751 [5:12:49<23:41:27, 59.19s/it] 18%|███████████████▋                                                                        | 311/1751 [5:13:50<23:52:09, 59.67s/it]                                                                                                                                     {'loss': '0.5279', 'grad_norm': '0.2119', 'learning_rate': '1.913e-05', 'ppl': '1.695', 'memory/max_active (GiB)': '72.03', 'memory/max_allocated (GiB)': '72.03', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '68.38', 'tokens/total': 589783040, 'tokens/trainable': 217926576, 'epoch': '0.5329'}
 18%|███████████████▋                                                                        | 311/1751 [5:13:50<23:52:09, 59.67s/it] 18%|███████████████▋                                                                        | 312/1751 [5:14:51<23:59:43, 60.03s/it]                                                                                                                                     {'loss': '0.5358', 'grad_norm': '0.207', 'learning_rate': '1.912e-05', 'ppl': '1.709', 'memory/max_active (GiB)': '76.05', 'memory/max_allocated (GiB)': '76.05', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '59.01', 'tokens/total': 591753344, 'tokens/trainable': 218668656, 'epoch': '0.5346'}
 18%|███████████████▋                                                                        | 312/1751 [5:14:51<23:59:43, 60.03s/it] 18%|███████████████▋                                                                        | 313/1751 [5:15:49<23:45:59, 59.50s/it]                                                                                                                                     {'loss': '0.5477', 'grad_norm': '0.2559', 'learning_rate': '1.911e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '73.75', 'memory/max_allocated (GiB)': '73.75', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '140.3', 'tokens/total': 593578944, 'tokens/trainable': 219362208, 'epoch': '0.5364'}
 18%|███████████████▋                                                                        | 313/1751 [5:15:49<23:45:59, 59.50s/it] 18%|███████████████▊                                                                        | 314/1751 [5:16:50<23:55:55, 59.96s/it]                                                                                                                                     {'loss': '0.5612', 'grad_norm': '0.2168', 'learning_rate': '1.91e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '71.39', 'memory/max_allocated (GiB)': '71.39', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '69.67', 'tokens/total': 595496384, 'tokens/trainable': 220074784, 'epoch': '0.5381'}
 18%|███████████████▊                                                                        | 314/1751 [5:16:50<23:55:55, 59.96s/it] 18%|███████████████▊                                                                        | 315/1751 [5:17:48<23:40:35, 59.36s/it]                                                                                                                                     {'loss': '0.5678', 'grad_norm': '0.2178', 'learning_rate': '1.91e-05', 'ppl': '1.764', 'memory/max_active (GiB)': '78.03', 'memory/max_allocated (GiB)': '78.03', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '160.7', 'tokens/total': 597352384, 'tokens/trainable': 220762912, 'epoch': '0.5398'}
 18%|███████████████▊                                                                        | 315/1751 [5:17:48<23:40:35, 59.36s/it] 18%|███████████████▉                                                                        | 316/1751 [5:18:47<23:33:29, 59.10s/it]                                                                                                                                     {'loss': '0.5998', 'grad_norm': '0.2676', 'learning_rate': '1.909e-05', 'ppl': '1.822', 'memory/max_active (GiB)': '66.44', 'memory/max_allocated (GiB)': '66.44', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '124.9', 'tokens/total': 599168128, 'tokens/trainable': 221435440, 'epoch': '0.5415'}
 18%|███████████████▉                                                                        | 316/1751 [5:18:47<23:33:29, 59.10s/it] 18%|███████████████▉                                                                        | 317/1751 [5:19:46<23:35:10, 59.21s/it]                                                                                                                                     {'loss': '0.5401', 'grad_norm': '0.2314', 'learning_rate': '1.908e-05', 'ppl': '1.716', 'memory/max_active (GiB)': '74.78', 'memory/max_allocated (GiB)': '74.78', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '99.93', 'tokens/total': 601036800, 'tokens/trainable': 222135712, 'epoch': '0.5432'}
 18%|███████████████▉                                                                        | 317/1751 [5:19:46<23:35:10, 59.21s/it] 18%|███████████████▉                                                                        | 318/1751 [5:20:46<23:37:18, 59.34s/it]                                                                                                                                     {'loss': '0.5173', 'grad_norm': '0.207', 'learning_rate': '1.907e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '72.57', 'memory/max_allocated (GiB)': '72.57', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '126.4', 'tokens/total': 602945088, 'tokens/trainable': 222834528, 'epoch': '0.5449'}
 18%|███████████████▉                                                                        | 318/1751 [5:20:46<23:37:18, 59.34s/it] 18%|████████████████                                                                        | 319/1751 [5:21:46<23:43:09, 59.63s/it]                                                                                                                                     {'loss': '0.5324', 'grad_norm': '0.2236', 'learning_rate': '1.906e-05', 'ppl': '1.703', 'memory/max_active (GiB)': '73.7', 'memory/max_allocated (GiB)': '73.7', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '81.27', 'tokens/total': 604872576, 'tokens/trainable': 223534288, 'epoch': '0.5466'}
 18%|████████████████                                                                        | 319/1751 [5:21:46<23:43:09, 59.63s/it] 18%|████████████████                                                                        | 320/1751 [5:22:49<24:04:37, 60.57s/it]                                                                                                                                     {'loss': '0.5488', 'grad_norm': '0.2012', 'learning_rate': '1.906e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '76.93', 'memory/max_allocated (GiB)': '76.93', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '84.26', 'tokens/total': 606828480, 'tokens/trainable': 224284752, 'epoch': '0.5483'}
 18%|████████████████                                                                        | 320/1751 [5:22:49<24:04:37, 60.57s/it] 18%|████████████████▏                                                                       | 321/1751 [5:23:49<24:00:45, 60.45s/it]                                                                                                                                     {'loss': '0.5145', 'grad_norm': '0.1953', 'learning_rate': '1.905e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '74.82', 'memory/max_allocated (GiB)': '74.82', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '78.52', 'tokens/total': 608747264, 'tokens/trainable': 225013568, 'epoch': '0.5501'}
 18%|████████████████▏                                                                       | 321/1751 [5:23:49<24:00:45, 60.45s/it] 18%|████████████████▏                                                                       | 322/1751 [5:24:49<23:55:06, 60.26s/it]                                                                                                                                     {'loss': '0.4941', 'grad_norm': '0.2119', 'learning_rate': '1.904e-05', 'ppl': '1.639', 'memory/max_active (GiB)': '76.61', 'memory/max_allocated (GiB)': '76.61', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '197.9', 'tokens/total': 610681472, 'tokens/trainable': 225726624, 'epoch': '0.5518'}
 18%|████████████████▏                                                                       | 322/1751 [5:24:49<23:55:06, 60.26s/it] 18%|████████████████▏                                                                       | 323/1751 [5:25:49<23:52:43, 60.20s/it]                                                                                                                                     {'loss': '0.5266', 'grad_norm': '0.2002', 'learning_rate': '1.903e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '72.36', 'memory/max_allocated (GiB)': '72.36', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '46.08', 'tokens/total': 612600832, 'tokens/trainable': 226463600, 'epoch': '0.5535'}
 18%|████████████████▏                                                                       | 323/1751 [5:25:49<23:52:43, 60.20s/it] 19%|████████████████▎                                                                       | 324/1751 [5:26:50<23:57:04, 60.42s/it]                                                                                                                                     {'loss': '0.5394', 'grad_norm': '0.2119', 'learning_rate': '1.902e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '75.37', 'memory/max_allocated (GiB)': '75.37', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '53.48', 'tokens/total': 614547776, 'tokens/trainable': 227185216, 'epoch': '0.5552'}
 19%|████████████████▎                                                                       | 324/1751 [5:26:50<23:57:04, 60.42s/it] 19%|████████████████▎                                                                       | 325/1751 [5:27:47<23:31:14, 59.38s/it]                                                                                                                                     {'loss': '0.5753', 'grad_norm': '0.21', 'learning_rate': '1.902e-05', 'ppl': '1.778', 'memory/max_active (GiB)': '73.12', 'memory/max_allocated (GiB)': '73.12', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '207.4', 'tokens/total': 616317632, 'tokens/trainable': 227864512, 'epoch': '0.5569'}
 19%|████████████████▎                                                                       | 325/1751 [5:27:47<23:31:14, 59.38s/it] 19%|████████████████▍                                                                       | 326/1751 [5:28:46<23:29:14, 59.34s/it]                                                                                                                                     {'loss': '0.5718', 'grad_norm': '0.2168', 'learning_rate': '1.901e-05', 'ppl': '1.771', 'memory/max_active (GiB)': '75.81', 'memory/max_allocated (GiB)': '75.81', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '52.74', 'tokens/total': 618149632, 'tokens/trainable': 228548224, 'epoch': '0.5586'}
 19%|████████████████▍                                                                       | 326/1751 [5:28:46<23:29:14, 59.34s/it] 19%|████████████████▍                                                                       | 327/1751 [5:29:45<23:24:47, 59.19s/it]                                                                                                                                     {'loss': '0.5451', 'grad_norm': '0.1992', 'learning_rate': '1.9e-05', 'ppl': '1.725', 'memory/max_active (GiB)': '76.32', 'memory/max_allocated (GiB)': '76.32', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '213.2', 'tokens/total': 620063296, 'tokens/trainable': 229245184, 'epoch': '0.5603'}
 19%|████████████████▍                                                                       | 327/1751 [5:29:45<23:24:47, 59.19s/it] 19%|████████████████▍                                                                       | 328/1751 [5:30:44<23:20:34, 59.05s/it]                                                                                                                                     {'loss': '0.5231', 'grad_norm': '0.1934', 'learning_rate': '1.899e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '76.65', 'memory/max_allocated (GiB)': '76.65', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '72.59', 'tokens/total': 621919552, 'tokens/trainable': 229935392, 'epoch': '0.5621'}
 19%|████████████████▍                                                                       | 328/1751 [5:30:44<23:20:34, 59.05s/it] 19%|████████████████▌                                                                       | 329/1751 [5:31:44<23:27:56, 59.41s/it]                                                                                                                                     {'loss': '0.5427', 'grad_norm': '0.208', 'learning_rate': '1.898e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '72.09', 'memory/max_allocated (GiB)': '72.09', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '31', 'tokens/total': 623828352, 'tokens/trainable': 230636400, 'epoch': '0.5638'}
 19%|████████████████▌                                                                       | 329/1751 [5:31:44<23:27:56, 59.41s/it] 19%|████████████████▌                                                                       | 330/1751 [5:32:44<23:30:21, 59.55s/it]                                                                                                                                     {'loss': '0.5523', 'grad_norm': '0.2178', 'learning_rate': '1.897e-05', 'ppl': '1.737', 'memory/max_active (GiB)': '73.78', 'memory/max_allocated (GiB)': '73.78', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '109', 'tokens/total': 625782912, 'tokens/trainable': 231361024, 'epoch': '0.5655'}
 19%|████████████████▌                                                                       | 330/1751 [5:32:44<23:30:21, 59.55s/it] 19%|████████████████▋                                                                       | 331/1751 [5:33:42<23:17:55, 59.07s/it]                                                                                                                                     {'loss': '0.5707', 'grad_norm': '0.2178', 'learning_rate': '1.897e-05', 'ppl': '1.77', 'memory/max_active (GiB)': '72.49', 'memory/max_allocated (GiB)': '72.49', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '98.49', 'tokens/total': 627576960, 'tokens/trainable': 232015168, 'epoch': '0.5672'}
 19%|████████████████▋                                                                       | 331/1751 [5:33:42<23:17:55, 59.07s/it] 19%|████████████████▋                                                                       | 332/1751 [5:34:41<23:18:38, 59.14s/it]                                                                                                                                     {'loss': '0.5521', 'grad_norm': '0.2021', 'learning_rate': '1.896e-05', 'ppl': '1.737', 'memory/max_active (GiB)': '75.46', 'memory/max_allocated (GiB)': '75.46', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '56.26', 'tokens/total': 629441472, 'tokens/trainable': 232708176, 'epoch': '0.5689'}
 19%|████████████████▋                                                                       | 332/1751 [5:34:41<23:18:38, 59.14s/it] 19%|████████████████▋                                                                       | 333/1751 [5:35:40<23:19:36, 59.22s/it]                                                                                                                                     {'loss': '0.5466', 'grad_norm': '0.208', 'learning_rate': '1.895e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '71.07', 'memory/max_allocated (GiB)': '71.07', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '92.77', 'tokens/total': 631320192, 'tokens/trainable': 233359280, 'epoch': '0.5706'}
 19%|████████████████▋                                                                       | 333/1751 [5:35:40<23:19:36, 59.22s/it] 19%|████████████████▊                                                                       | 334/1751 [5:36:42<23:32:44, 59.82s/it]                                                                                                                                     {'loss': '0.5183', 'grad_norm': '0.1953', 'learning_rate': '1.894e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '73.72', 'memory/max_allocated (GiB)': '73.72', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '45.9', 'tokens/total': 633255808, 'tokens/trainable': 234070240, 'epoch': '0.5723'}
 19%|████████████████▊                                                                       | 334/1751 [5:36:42<23:32:44, 59.82s/it] 19%|████████████████▊                                                                       | 335/1751 [5:37:43<23:40:17, 60.18s/it]                                                                                                                                     {'loss': '0.5397', 'grad_norm': '0.1934', 'learning_rate': '1.893e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '71.45', 'memory/max_allocated (GiB)': '71.45', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '43.1', 'tokens/total': 635170432, 'tokens/trainable': 234785072, 'epoch': '0.5741'}
 19%|████████████████▊                                                                       | 335/1751 [5:37:43<23:40:17, 60.18s/it] 19%|████████████████▉                                                                       | 336/1751 [5:38:39<23:11:34, 59.01s/it]                                                                                                                                     {'loss': '0.5697', 'grad_norm': '0.2139', 'learning_rate': '1.892e-05', 'ppl': '1.768', 'memory/max_active (GiB)': '74.09', 'memory/max_allocated (GiB)': '74.09', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '90.64', 'tokens/total': 636938816, 'tokens/trainable': 235431936, 'epoch': '0.5758'}
 19%|████████████████▉                                                                       | 336/1751 [5:38:39<23:11:34, 59.01s/it] 19%|████████████████▉                                                                       | 337/1751 [5:39:38<23:11:59, 59.07s/it]                                                                                                                                     {'loss': '0.5486', 'grad_norm': '0.1924', 'learning_rate': '1.892e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '75.51', 'memory/max_allocated (GiB)': '75.51', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '66.93', 'tokens/total': 638859008, 'tokens/trainable': 236180032, 'epoch': '0.5775'}
 19%|████████████████▉                                                                       | 337/1751 [5:39:38<23:11:59, 59.07s/it] 19%|████████████████▉                                                                       | 338/1751 [5:40:37<23:12:35, 59.13s/it]                                                                                                                                     {'loss': '0.5642', 'grad_norm': '0.21', 'learning_rate': '1.891e-05', 'ppl': '1.758', 'memory/max_active (GiB)': '69.41', 'memory/max_allocated (GiB)': '69.41', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '158.5', 'tokens/total': 640700224, 'tokens/trainable': 236893136, 'epoch': '0.5792'}
 19%|████████████████▉                                                                       | 338/1751 [5:40:37<23:12:35, 59.13s/it] 19%|█████████████████                                                                       | 339/1751 [5:41:38<23:21:42, 59.56s/it]                                                                                                                                     {'loss': '0.5288', 'grad_norm': '0.1914', 'learning_rate': '1.89e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '76.36', 'memory/max_allocated (GiB)': '76.36', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '171.4', 'tokens/total': 642624768, 'tokens/trainable': 237611872, 'epoch': '0.5809'}
 19%|█████████████████                                                                       | 339/1751 [5:41:38<23:21:42, 59.56s/it] 19%|█████████████████                                                                       | 340/1751 [5:42:37<23:17:04, 59.41s/it]                                                                                                                                     {'loss': '0.5527', 'grad_norm': '0.2139', 'learning_rate': '1.889e-05', 'ppl': '1.738', 'memory/max_active (GiB)': '73.84', 'memory/max_allocated (GiB)': '73.84', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '144.8', 'tokens/total': 644430080, 'tokens/trainable': 238274080, 'epoch': '0.5826'}
 19%|█████████████████                                                                       | 340/1751 [5:42:37<23:17:04, 59.41s/it] 19%|█████████████████▏                                                                      | 341/1751 [5:43:40<23:38:44, 60.37s/it]                                                                                                                                     {'loss': '0.5226', 'grad_norm': '0.2051', 'learning_rate': '1.888e-05', 'ppl': '1.686', 'memory/max_active (GiB)': '75.25', 'memory/max_allocated (GiB)': '75.25', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '98.55', 'tokens/total': 646456768, 'tokens/trainable': 238996096, 'epoch': '0.5843'}
 19%|█████████████████▏                                                                      | 341/1751 [5:43:40<23:38:44, 60.37s/it] 20%|█████████████████▏                                                                      | 342/1751 [5:44:39<23:31:06, 60.09s/it]                                                                                                                                     {'loss': '0.5399', 'grad_norm': '0.2139', 'learning_rate': '1.887e-05', 'ppl': '1.716', 'memory/max_active (GiB)': '72.8', 'memory/max_allocated (GiB)': '72.8', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '47.54', 'tokens/total': 648324160, 'tokens/trainable': 239666688, 'epoch': '0.586'}
 20%|█████████████████▏                                                                      | 342/1751 [5:44:39<23:31:06, 60.09s/it] 20%|█████████████████▏                                                                      | 343/1751 [5:45:39<23:29:23, 60.06s/it]                                                                                                                                     {'loss': '0.5423', 'grad_norm': '0.2051', 'learning_rate': '1.886e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '70.04', 'memory/max_allocated (GiB)': '70.04', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '80.24', 'tokens/total': 650249216, 'tokens/trainable': 240373744, 'epoch': '0.5878'}
 20%|█████████████████▏                                                                      | 343/1751 [5:45:39<23:29:23, 60.06s/it] 20%|█████████████████▎                                                                      | 344/1751 [5:46:39<23:24:08, 59.88s/it]                                                                                                                                     {'loss': '0.5418', 'grad_norm': '0.209', 'learning_rate': '1.885e-05', 'ppl': '1.719', 'memory/max_active (GiB)': '73.57', 'memory/max_allocated (GiB)': '73.57', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '68.64', 'tokens/total': 652107264, 'tokens/trainable': 241097936, 'epoch': '0.5895'}
 20%|█████████████████▎                                                                      | 344/1751 [5:46:39<23:24:08, 59.88s/it] 20%|█████████████████▎                                                                      | 345/1751 [5:47:37<23:14:28, 59.51s/it]                                                                                                                                     {'loss': '0.5291', 'grad_norm': '0.2246', 'learning_rate': '1.885e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '72.27', 'memory/max_allocated (GiB)': '72.27', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '120.7', 'tokens/total': 653913280, 'tokens/trainable': 241788992, 'epoch': '0.5912'}
 20%|█████████████████▎                                                                      | 345/1751 [5:47:37<23:14:28, 59.51s/it] 20%|█████████████████▍                                                                      | 346/1751 [5:48:40<23:35:49, 60.46s/it]                                                                                                                                     {'loss': '0.5088', 'grad_norm': '0.1973', 'learning_rate': '1.884e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '75.69', 'memory/max_allocated (GiB)': '75.69', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '113.9', 'tokens/total': 655967232, 'tokens/trainable': 242545408, 'epoch': '0.5929'}
 20%|█████████████████▍                                                                      | 346/1751 [5:48:40<23:35:49, 60.46s/it] 20%|█████████████████▍                                                                      | 347/1751 [5:49:38<23:19:49, 59.82s/it]                                                                                                                                     {'loss': '0.5584', 'grad_norm': '0.249', 'learning_rate': '1.883e-05', 'ppl': '1.748', 'memory/max_active (GiB)': '68.74', 'memory/max_allocated (GiB)': '68.74', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '148.3', 'tokens/total': 657794560, 'tokens/trainable': 243235984, 'epoch': '0.5946'}
 20%|█████████████████▍                                                                      | 347/1751 [5:49:38<23:19:49, 59.82s/it] 20%|█████████████████▍                                                                      | 348/1751 [5:50:35<23:00:02, 59.02s/it]                                                                                                                                     {'loss': '0.5501', 'grad_norm': '0.2334', 'learning_rate': '1.882e-05', 'ppl': '1.733', 'memory/max_active (GiB)': '70.39', 'memory/max_allocated (GiB)': '70.39', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '27.13', 'tokens/total': 659571392, 'tokens/trainable': 243861568, 'epoch': '0.5963'}
 20%|█████████████████▍                                                                      | 348/1751 [5:50:35<23:00:02, 59.02s/it] 20%|█████████████████▌                                                                      | 349/1751 [5:51:37<23:14:14, 59.67s/it]                                                                                                                                     {'loss': '0.5621', 'grad_norm': '0.207', 'learning_rate': '1.881e-05', 'ppl': '1.754', 'memory/max_active (GiB)': '73.9', 'memory/max_allocated (GiB)': '73.9', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '118.7', 'tokens/total': 661537792, 'tokens/trainable': 244585616, 'epoch': '0.598'}
 20%|█████████████████▌                                                                      | 349/1751 [5:51:37<23:14:14, 59.67s/it] 20%|█████████████████▌                                                                      | 350/1751 [5:52:37<23:19:57, 59.96s/it]                                                                                                                                     {'loss': '0.5139', 'grad_norm': '0.2119', 'learning_rate': '1.88e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '71.51', 'memory/max_allocated (GiB)': '71.51', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '163.5', 'tokens/total': 663436480, 'tokens/trainable': 245281920, 'epoch': '0.5998'}
 20%|█████████████████▌                                                                      | 350/1751 [5:52:37<23:19:57, 59.96s/it] 20%|█████████████████▋                                                                      | 351/1751 [5:53:37<23:16:27, 59.85s/it]                                                                                                                                     {'loss': '0.5435', 'grad_norm': '0.2305', 'learning_rate': '1.879e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '72.5', 'memory/max_allocated (GiB)': '72.5', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '188.9', 'tokens/total': 665310976, 'tokens/trainable': 245980768, 'epoch': '0.6015'}
 20%|█████████████████▋                                                                      | 351/1751 [5:53:37<23:16:27, 59.85s/it] 20%|█████████████████▋                                                                      | 352/1751 [5:54:37<23:17:48, 59.95s/it]                                                                                                                                     {'loss': '0.5294', 'grad_norm': '0.2168', 'learning_rate': '1.878e-05', 'ppl': '1.698', 'memory/max_active (GiB)': '74.77', 'memory/max_allocated (GiB)': '74.77', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '56.2', 'tokens/total': 667210368, 'tokens/trainable': 246680256, 'epoch': '0.6032'}
 20%|█████████████████▋                                                                      | 352/1751 [5:54:37<23:17:48, 59.95s/it] 20%|█████████████████▋                                                                      | 353/1751 [5:55:38<23:24:46, 60.29s/it]                                                                                                                                     {'loss': '0.5192', 'grad_norm': '0.2021', 'learning_rate': '1.877e-05', 'ppl': '1.681', 'memory/max_active (GiB)': '75.63', 'memory/max_allocated (GiB)': '75.63', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '167', 'tokens/total': 669142272, 'tokens/trainable': 247383376, 'epoch': '0.6049'}
 20%|█████████████████▋                                                                      | 353/1751 [5:55:38<23:24:46, 60.29s/it] 20%|█████████████████▊                                                                      | 354/1751 [5:56:40<23:38:54, 60.94s/it]                                                                                                                                     {'loss': '0.5083', 'grad_norm': '0.2139', 'learning_rate': '1.877e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '76.71', 'memory/max_allocated (GiB)': '76.71', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '175.3', 'tokens/total': 671125184, 'tokens/trainable': 248126624, 'epoch': '0.6066'}
 20%|█████████████████▊                                                                      | 354/1751 [5:56:40<23:38:54, 60.94s/it] 20%|█████████████████▊                                                                      | 355/1751 [5:57:40<23:28:50, 60.55s/it]                                                                                                                                     {'loss': '0.5455', 'grad_norm': '0.2012', 'learning_rate': '1.876e-05', 'ppl': '1.725', 'memory/max_active (GiB)': '75.06', 'memory/max_allocated (GiB)': '75.06', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '107.8', 'tokens/total': 673014080, 'tokens/trainable': 248823328, 'epoch': '0.6083'}
 20%|█████████████████▊                                                                      | 355/1751 [5:57:40<23:28:50, 60.55s/it] 20%|█████████████████▉                                                                      | 356/1751 [5:58:41<23:28:55, 60.60s/it]                                                                                                                                     {'loss': '0.5386', 'grad_norm': '0.1924', 'learning_rate': '1.875e-05', 'ppl': '1.714', 'memory/max_active (GiB)': '75.05', 'memory/max_allocated (GiB)': '75.05', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '51.43', 'tokens/total': 674965696, 'tokens/trainable': 249543056, 'epoch': '0.61'}
 20%|█████████████████▉                                                                      | 356/1751 [5:58:41<23:28:55, 60.60s/it] 20%|█████████████████▉                                                                      | 357/1751 [5:59:40<23:20:54, 60.30s/it]                                                                                                                                     {'loss': '0.5586', 'grad_norm': '0.2051', 'learning_rate': '1.874e-05', 'ppl': '1.748', 'memory/max_active (GiB)': '72.66', 'memory/max_allocated (GiB)': '72.66', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '53.15', 'tokens/total': 676836480, 'tokens/trainable': 250238624, 'epoch': '0.6118'}
 20%|█████████████████▉                                                                      | 357/1751 [5:59:40<23:20:54, 60.30s/it] 20%|█████████████████▉                                                                      | 358/1751 [6:00:45<23:46:26, 61.44s/it]                                                                                                                                     {'loss': '0.5177', 'grad_norm': '0.1914', 'learning_rate': '1.873e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '74.13', 'memory/max_allocated (GiB)': '74.13', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '44.15', 'tokens/total': 678963904, 'tokens/trainable': 250996272, 'epoch': '0.6135'}
 20%|█████████████████▉                                                                      | 358/1751 [6:00:45<23:46:26, 61.44s/it] 21%|██████████████████                                                                      | 359/1751 [6:01:44<23:33:04, 60.91s/it]                                                                                                                                     {'loss': '0.5432', 'grad_norm': '0.21', 'learning_rate': '1.872e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '78.64', 'memory/max_allocated (GiB)': '78.64', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '61.34', 'tokens/total': 680848064, 'tokens/trainable': 251681824, 'epoch': '0.6152'}
 21%|██████████████████                                                                      | 359/1751 [6:01:44<23:33:04, 60.91s/it] 21%|██████████████████                                                                      | 360/1751 [6:02:44<23:26:27, 60.67s/it]                                                                                                                                     {'loss': '0.5485', 'grad_norm': '0.2012', 'learning_rate': '1.871e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '75.96', 'memory/max_allocated (GiB)': '75.96', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '27.21', 'tokens/total': 682779456, 'tokens/trainable': 252399328, 'epoch': '0.6169'}
 21%|██████████████████                                                                      | 360/1751 [6:02:44<23:26:27, 60.67s/it] 21%|██████████████████▏                                                                     | 361/1751 [6:03:41<22:56:30, 59.42s/it]                                                                                                                                     {'loss': '0.5457', 'grad_norm': '0.2041', 'learning_rate': '1.87e-05', 'ppl': '1.726', 'memory/max_active (GiB)': '72.95', 'memory/max_allocated (GiB)': '72.95', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '59.64', 'tokens/total': 684594432, 'tokens/trainable': 253053312, 'epoch': '0.6186'}
 21%|██████████████████▏                                                                     | 361/1751 [6:03:41<22:56:30, 59.42s/it] 21%|██████████████████▏                                                                     | 362/1751 [6:04:38<22:37:21, 58.63s/it]                                                                                                                                     {'loss': '0.5483', 'grad_norm': '0.2109', 'learning_rate': '1.869e-05', 'ppl': '1.73', 'memory/max_active (GiB)': '65.08', 'memory/max_allocated (GiB)': '65.08', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '89.23', 'tokens/total': 686345152, 'tokens/trainable': 253707008, 'epoch': '0.6203'}
 21%|██████████████████▏                                                                     | 362/1751 [6:04:38<22:37:21, 58.63s/it] 21%|██████████████████▏                                                                     | 363/1751 [6:05:39<22:55:19, 59.45s/it]                                                                                                                                     {'loss': '0.545', 'grad_norm': '0.1973', 'learning_rate': '1.868e-05', 'ppl': '1.725', 'memory/max_active (GiB)': '76.62', 'memory/max_allocated (GiB)': '76.62', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '127.9', 'tokens/total': 688321472, 'tokens/trainable': 254409280, 'epoch': '0.622'}
 21%|██████████████████▏                                                                     | 363/1751 [6:05:39<22:55:19, 59.45s/it] 21%|██████████████████▎                                                                     | 364/1751 [6:06:37<22:41:19, 58.89s/it]                                                                                                                                     {'loss': '0.5743', 'grad_norm': '0.2246', 'learning_rate': '1.867e-05', 'ppl': '1.776', 'memory/max_active (GiB)': '74.91', 'memory/max_allocated (GiB)': '74.91', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '97.05', 'tokens/total': 690176896, 'tokens/trainable': 255080560, 'epoch': '0.6237'}
 21%|██████████████████▎                                                                     | 364/1751 [6:06:37<22:41:19, 58.89s/it] 21%|██████████████████▎                                                                     | 365/1751 [6:07:35<22:33:50, 58.61s/it]                                                                                                                                     {'loss': '0.5375', 'grad_norm': '0.2021', 'learning_rate': '1.866e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '73.94', 'memory/max_allocated (GiB)': '73.94', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '46.87', 'tokens/total': 691966080, 'tokens/trainable': 255764320, 'epoch': '0.6255'}
 21%|██████████████████▎                                                                     | 365/1751 [6:07:35<22:33:50, 58.61s/it] 21%|██████████████████▍                                                                     | 366/1751 [6:08:35<22:47:08, 59.23s/it]                                                                                                                                     {'loss': '0.5363', 'grad_norm': '0.2148', 'learning_rate': '1.865e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '75.68', 'memory/max_allocated (GiB)': '75.68', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '66.18', 'tokens/total': 693890432, 'tokens/trainable': 256469456, 'epoch': '0.6272'}
 21%|██████████████████▍                                                                     | 366/1751 [6:08:35<22:47:08, 59.23s/it] 21%|██████████████████▍                                                                     | 367/1751 [6:09:37<23:03:56, 60.00s/it]                                                                                                                                     {'loss': '0.5377', 'grad_norm': '0.1992', 'learning_rate': '1.864e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '74.73', 'memory/max_allocated (GiB)': '74.73', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '107.2', 'tokens/total': 695844160, 'tokens/trainable': 257196432, 'epoch': '0.6289'}
 21%|██████████████████▍                                                                     | 367/1751 [6:09:37<23:03:56, 60.00s/it] 21%|██████████████████▍                                                                     | 368/1751 [6:10:37<23:02:23, 59.97s/it]                                                                                                                                     {'loss': '0.5301', 'grad_norm': '0.2168', 'learning_rate': '1.863e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '70.62', 'memory/max_allocated (GiB)': '70.62', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '116.3', 'tokens/total': 697742464, 'tokens/trainable': 257909152, 'epoch': '0.6306'}
 21%|██████████████████▍                                                                     | 368/1751 [6:10:37<23:02:23, 59.97s/it] 21%|██████████████████▌                                                                     | 369/1751 [6:11:36<22:57:59, 59.83s/it]                                                                                                                                     {'loss': '0.53', 'grad_norm': '0.2256', 'learning_rate': '1.863e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '75.19', 'memory/max_allocated (GiB)': '75.19', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '61.82', 'tokens/total': 699580416, 'tokens/trainable': 258560912, 'epoch': '0.6323'}
 21%|██████████████████▌                                                                     | 369/1751 [6:11:36<22:57:59, 59.83s/it] 21%|██████████████████▌                                                                     | 370/1751 [6:12:34<22:39:12, 59.05s/it]                                                                                                                                     {'loss': '0.5315', 'grad_norm': '0.2041', 'learning_rate': '1.862e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '73.71', 'memory/max_allocated (GiB)': '73.71', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '14.11', 'tokens/total': 701380416, 'tokens/trainable': 259186928, 'epoch': '0.634'}
 21%|██████████████████▌                                                                     | 370/1751 [6:12:34<22:39:12, 59.05s/it] 21%|██████████████████▋                                                                     | 371/1751 [6:13:34<22:49:14, 59.53s/it]                                                                                                                                     {'loss': '0.5162', 'grad_norm': '0.21', 'learning_rate': '1.861e-05', 'ppl': '1.676', 'memory/max_active (GiB)': '72.49', 'memory/max_allocated (GiB)': '72.49', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '160.1', 'tokens/total': 703291584, 'tokens/trainable': 259843328, 'epoch': '0.6357'}
 21%|██████████████████▋                                                                     | 371/1751 [6:13:34<22:49:14, 59.53s/it] 21%|██████████████████▋                                                                     | 372/1751 [6:14:33<22:42:39, 59.29s/it]                                                                                                                                     {'loss': '0.5571', 'grad_norm': '0.2266', 'learning_rate': '1.86e-05', 'ppl': '1.746', 'memory/max_active (GiB)': '76.17', 'memory/max_allocated (GiB)': '76.17', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '78.89', 'tokens/total': 705131520, 'tokens/trainable': 260500688, 'epoch': '0.6375'}
 21%|██████████████████▋                                                                     | 372/1751 [6:14:33<22:42:39, 59.29s/it] 21%|██████████████████▋                                                                     | 373/1751 [6:15:33<22:48:57, 59.61s/it]                                                                                                                                     {'loss': '0.5421', 'grad_norm': '0.2021', 'learning_rate': '1.859e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '69.68', 'memory/max_allocated (GiB)': '69.68', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '159.8', 'tokens/total': 707043520, 'tokens/trainable': 261230448, 'epoch': '0.6392'}
 21%|██████████████████▋                                                                     | 373/1751 [6:15:33<22:48:57, 59.61s/it] 21%|██████████████████▊                                                                     | 374/1751 [6:16:31<22:37:49, 59.16s/it]                                                                                                                                     {'loss': '0.5386', 'grad_norm': '0.2061', 'learning_rate': '1.858e-05', 'ppl': '1.714', 'memory/max_active (GiB)': '77.82', 'memory/max_allocated (GiB)': '77.82', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '132.4', 'tokens/total': 708883584, 'tokens/trainable': 261921408, 'epoch': '0.6409'}
 21%|██████████████████▊                                                                     | 374/1751 [6:16:31<22:37:49, 59.16s/it] 21%|██████████████████▊                                                                     | 375/1751 [6:17:32<22:46:33, 59.59s/it]                                                                                                                                     {'loss': '0.508', 'grad_norm': '0.1953', 'learning_rate': '1.857e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '76.7', 'memory/max_allocated (GiB)': '76.7', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '45.84', 'tokens/total': 710763840, 'tokens/trainable': 262643536, 'epoch': '0.6426'}
 21%|██████████████████▊                                                                     | 375/1751 [6:17:32<22:46:33, 59.59s/it] 21%|██████████████████▉                                                                     | 376/1751 [6:18:31<22:41:52, 59.43s/it]                                                                                                                                     {'loss': '0.5605', 'grad_norm': '0.207', 'learning_rate': '1.856e-05', 'ppl': '1.751', 'memory/max_active (GiB)': '76.39', 'memory/max_allocated (GiB)': '76.39', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '140.2', 'tokens/total': 712613248, 'tokens/trainable': 263310608, 'epoch': '0.6443'}
 21%|██████████████████▉                                                                     | 376/1751 [6:18:31<22:41:52, 59.43s/it] 22%|██████████████████▉                                                                     | 377/1751 [6:19:32<22:51:14, 59.88s/it]                                                                                                                                     {'loss': '0.5162', 'grad_norm': '0.1973', 'learning_rate': '1.855e-05', 'ppl': '1.676', 'memory/max_active (GiB)': '75.64', 'memory/max_allocated (GiB)': '75.64', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '113', 'tokens/total': 714562560, 'tokens/trainable': 264046272, 'epoch': '0.646'}
 22%|██████████████████▉                                                                     | 377/1751 [6:19:32<22:51:14, 59.88s/it] 22%|██████████████████▉                                                                     | 378/1751 [6:20:34<23:06:21, 60.58s/it]                                                                                                                                     {'loss': '0.5083', 'grad_norm': '0.1895', 'learning_rate': '1.854e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '76.02', 'memory/max_allocated (GiB)': '76.02', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '104.5', 'tokens/total': 716562752, 'tokens/trainable': 264804208, 'epoch': '0.6477'}
 22%|██████████████████▉                                                                     | 378/1751 [6:20:34<23:06:21, 60.58s/it] 22%|███████████████████                                                                     | 379/1751 [6:21:34<22:56:31, 60.20s/it]                                                                                                                                     {'loss': '0.5403', 'grad_norm': '0.1973', 'learning_rate': '1.853e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '72.71', 'memory/max_allocated (GiB)': '72.71', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '78.61', 'tokens/total': 718412864, 'tokens/trainable': 265462656, 'epoch': '0.6495'}
 22%|███████████████████                                                                     | 379/1751 [6:21:34<22:56:31, 60.20s/it] 22%|███████████████████                                                                     | 380/1751 [6:22:32<22:40:55, 59.56s/it]                                                                                                                                     {'loss': '0.574', 'grad_norm': '0.2119', 'learning_rate': '1.852e-05', 'ppl': '1.775', 'memory/max_active (GiB)': '72.69', 'memory/max_allocated (GiB)': '72.69', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '110.7', 'tokens/total': 720271616, 'tokens/trainable': 266154416, 'epoch': '0.6512'}
 22%|███████████████████                                                                     | 380/1751 [6:22:32<22:40:55, 59.56s/it] 22%|███████████████████▏                                                                    | 381/1751 [6:23:31<22:39:42, 59.55s/it]                                                                                                                                     {'loss': '0.5153', 'grad_norm': '0.2197', 'learning_rate': '1.851e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '75.16', 'memory/max_allocated (GiB)': '75.16', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '93.53', 'tokens/total': 722153216, 'tokens/trainable': 266847472, 'epoch': '0.6529'}
 22%|███████████████████▏                                                                    | 381/1751 [6:23:31<22:39:42, 59.55s/it] 22%|███████████████████▏                                                                    | 382/1751 [6:24:31<22:38:35, 59.54s/it]                                                                                                                                     {'loss': '0.5236', 'grad_norm': '0.1992', 'learning_rate': '1.85e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '75.61', 'memory/max_allocated (GiB)': '75.61', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '72.47', 'tokens/total': 724016576, 'tokens/trainable': 267530320, 'epoch': '0.6546'}
 22%|███████████████████▏                                                                    | 382/1751 [6:24:31<22:38:35, 59.54s/it] 22%|███████████████████▏                                                                    | 383/1751 [6:25:29<22:27:37, 59.11s/it]                                                                                                                                     {'loss': '0.5407', 'grad_norm': '0.2119', 'learning_rate': '1.849e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '72.49', 'memory/max_allocated (GiB)': '72.49', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '70.94', 'tokens/total': 725806208, 'tokens/trainable': 268223968, 'epoch': '0.6563'}
 22%|███████████████████▏                                                                    | 383/1751 [6:25:29<22:27:37, 59.11s/it] 22%|███████████████████▎                                                                    | 384/1751 [6:26:29<22:34:41, 59.46s/it]                                                                                                                                     {'loss': '0.5221', 'grad_norm': '0.2051', 'learning_rate': '1.848e-05', 'ppl': '1.686', 'memory/max_active (GiB)': '74.7', 'memory/max_allocated (GiB)': '74.7', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '125.8', 'tokens/total': 727755584, 'tokens/trainable': 268946656, 'epoch': '0.658'}
 22%|███████████████████▎                                                                    | 384/1751 [6:26:29<22:34:41, 59.46s/it] 22%|███████████████████▎                                                                    | 385/1751 [6:27:27<22:24:33, 59.06s/it]                                                                                                                                     {'loss': '0.5355', 'grad_norm': '0.2119', 'learning_rate': '1.847e-05', 'ppl': '1.708', 'memory/max_active (GiB)': '73.61', 'memory/max_allocated (GiB)': '73.61', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '67.27', 'tokens/total': 729592512, 'tokens/trainable': 269568896, 'epoch': '0.6597'}
 22%|███████████████████▎                                                                    | 385/1751 [6:27:27<22:24:33, 59.06s/it] 22%|███████████████████▍                                                                    | 386/1751 [6:28:28<22:34:15, 59.53s/it]                                                                                                                                     {'loss': '0.5218', 'grad_norm': '0.1924', 'learning_rate': '1.846e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '71.25', 'memory/max_allocated (GiB)': '71.25', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '120.4', 'tokens/total': 731514496, 'tokens/trainable': 270302048, 'epoch': '0.6614'}
 22%|███████████████████▍                                                                    | 386/1751 [6:28:28<22:34:15, 59.53s/it] 22%|███████████████████▍                                                                    | 387/1751 [6:29:28<22:38:35, 59.76s/it]                                                                                                                                     {'loss': '0.519', 'grad_norm': '0.1982', 'learning_rate': '1.845e-05', 'ppl': '1.68', 'memory/max_active (GiB)': '73.66', 'memory/max_allocated (GiB)': '73.66', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '138.1', 'tokens/total': 733416384, 'tokens/trainable': 271001312, 'epoch': '0.6632'}
 22%|███████████████████▍                                                                    | 387/1751 [6:29:28<22:38:35, 59.76s/it] 22%|███████████████████▍                                                                    | 388/1751 [6:30:25<22:19:06, 58.95s/it]                                                                                                                                     {'loss': '0.5614', 'grad_norm': '0.2041', 'learning_rate': '1.844e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '73.6', 'memory/max_allocated (GiB)': '73.6', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '56.75', 'tokens/total': 735187328, 'tokens/trainable': 271639488, 'epoch': '0.6649'}
 22%|███████████████████▍                                                                    | 388/1751 [6:30:25<22:19:06, 58.95s/it] 22%|███████████████████▌                                                                    | 389/1751 [6:31:25<22:21:08, 59.08s/it]                                                                                                                                     {'loss': '0.5639', 'grad_norm': '0.2129', 'learning_rate': '1.843e-05', 'ppl': '1.758', 'memory/max_active (GiB)': '74.85', 'memory/max_allocated (GiB)': '74.85', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '63.2', 'tokens/total': 737074048, 'tokens/trainable': 272347232, 'epoch': '0.6666'}
 22%|███████████████████▌                                                                    | 389/1751 [6:31:25<22:21:08, 59.08s/it] 22%|███████████████████▌                                                                    | 390/1751 [6:32:26<22:32:53, 59.64s/it]                                                                                                                                     {'loss': '0.5232', 'grad_norm': '0.1963', 'learning_rate': '1.842e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '74.71', 'memory/max_allocated (GiB)': '74.71', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '56.56', 'tokens/total': 739032320, 'tokens/trainable': 273061216, 'epoch': '0.6683'}
 22%|███████████████████▌                                                                    | 390/1751 [6:32:26<22:32:53, 59.64s/it] 22%|███████████████████▋                                                                    | 391/1751 [6:33:26<22:39:53, 60.00s/it]                                                                                                                                     {'loss': '0.5142', 'grad_norm': '0.2021', 'learning_rate': '1.841e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '75.25', 'memory/max_allocated (GiB)': '75.25', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '130.6', 'tokens/total': 740974272, 'tokens/trainable': 273801984, 'epoch': '0.67'}
 22%|███████████████████▋                                                                    | 391/1751 [6:33:26<22:39:53, 60.00s/it] 22%|███████████████████▋                                                                    | 392/1751 [6:34:25<22:27:46, 59.50s/it]                                                                                                                                     {'loss': '0.559', 'grad_norm': '0.2207', 'learning_rate': '1.84e-05', 'ppl': '1.749', 'memory/max_active (GiB)': '72.42', 'memory/max_allocated (GiB)': '72.42', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '257.4', 'tokens/total': 742834624, 'tokens/trainable': 274470016, 'epoch': '0.6717'}
 22%|███████████████████▋                                                                    | 392/1751 [6:34:25<22:27:46, 59.50s/it] 22%|███████████████████▊                                                                    | 393/1751 [6:35:27<22:45:54, 60.35s/it]                                                                                                                                     {'loss': '0.5201', 'grad_norm': '0.2061', 'learning_rate': '1.839e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '73.29', 'memory/max_allocated (GiB)': '73.29', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '31.32', 'tokens/total': 744875520, 'tokens/trainable': 275184224, 'epoch': '0.6734'}
 22%|███████████████████▊                                                                    | 393/1751 [6:35:27<22:45:54, 60.35s/it] 23%|███████████████████▊                                                                    | 394/1751 [6:36:26<22:35:37, 59.94s/it]                                                                                                                                     {'loss': '0.5489', 'grad_norm': '0.2109', 'learning_rate': '1.838e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '72.04', 'memory/max_allocated (GiB)': '72.04', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '94.72', 'tokens/total': 746742464, 'tokens/trainable': 275856352, 'epoch': '0.6752'}
 23%|███████████████████▊                                                                    | 394/1751 [6:36:26<22:35:37, 59.94s/it] 23%|███████████████████▊                                                                    | 395/1751 [6:37:26<22:34:54, 59.95s/it]                                                                                                                                     {'loss': '0.5474', 'grad_norm': '0.2031', 'learning_rate': '1.837e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '71.6', 'memory/max_allocated (GiB)': '71.6', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '78.22', 'tokens/total': 748658688, 'tokens/trainable': 276546304, 'epoch': '0.6769'}
 23%|███████████████████▊                                                                    | 395/1751 [6:37:26<22:34:54, 59.95s/it] 23%|███████████████████▉                                                                    | 396/1751 [6:38:25<22:25:22, 59.57s/it]                                                                                                                                     {'loss': '0.5526', 'grad_norm': '0.2236', 'learning_rate': '1.836e-05', 'ppl': '1.738', 'memory/max_active (GiB)': '74.19', 'memory/max_allocated (GiB)': '74.19', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '30.71', 'tokens/total': 750560448, 'tokens/trainable': 277189920, 'epoch': '0.6786'}
 23%|███████████████████▉                                                                    | 396/1751 [6:38:25<22:25:22, 59.57s/it] 23%|███████████████████▉                                                                    | 397/1751 [6:39:26<22:37:18, 60.15s/it]                                                                                                                                     {'loss': '0.5414', 'grad_norm': '0.1953', 'learning_rate': '1.835e-05', 'ppl': '1.718', 'memory/max_active (GiB)': '72.77', 'memory/max_allocated (GiB)': '72.77', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '119.9', 'tokens/total': 752561984, 'tokens/trainable': 277930688, 'epoch': '0.6803'}
 23%|███████████████████▉                                                                    | 397/1751 [6:39:26<22:37:18, 60.15s/it] 23%|████████████████████                                                                    | 398/1751 [6:40:26<22:36:15, 60.14s/it]                                                                                                                                     {'loss': '0.5281', 'grad_norm': '0.2158', 'learning_rate': '1.834e-05', 'ppl': '1.696', 'memory/max_active (GiB)': '70.12', 'memory/max_allocated (GiB)': '70.12', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '137.6', 'tokens/total': 754494400, 'tokens/trainable': 278641216, 'epoch': '0.682'}
 23%|████████████████████                                                                    | 398/1751 [6:40:26<22:36:15, 60.14s/it] 23%|████████████████████                                                                    | 399/1751 [6:41:24<22:20:52, 59.51s/it]                                                                                                                                     {'loss': '0.5492', 'grad_norm': '0.21', 'learning_rate': '1.833e-05', 'ppl': '1.732', 'memory/max_active (GiB)': '73.01', 'memory/max_allocated (GiB)': '73.01', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '50.43', 'tokens/total': 756302848, 'tokens/trainable': 279319008, 'epoch': '0.6837'}
 23%|████████████████████                                                                    | 399/1751 [6:41:24<22:20:52, 59.51s/it] 23%|████████████████████                                                                    | 400/1751 [6:42:24<22:23:17, 59.66s/it]                                                                                                                                     {'loss': '0.488', 'grad_norm': '0.1895', 'learning_rate': '1.831e-05', 'ppl': '1.629', 'memory/max_active (GiB)': '70.89', 'memory/max_allocated (GiB)': '70.89', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '60.92', 'tokens/total': 758231936, 'tokens/trainable': 280026752, 'epoch': '0.6854'}
 23%|████████████████████                                                                    | 400/1751 [6:42:24<22:23:17, 59.66s/it] 23%|████████████████████▏                                                                   | 401/1751 [6:43:20<21:56:35, 58.52s/it]                                                                                                                                     {'loss': '0.5454', 'grad_norm': '0.1953', 'learning_rate': '1.83e-05', 'ppl': '1.725', 'memory/max_active (GiB)': '73.9', 'memory/max_allocated (GiB)': '73.9', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '147.9', 'tokens/total': 760032512, 'tokens/trainable': 280683808, 'epoch': '0.6871'}
 23%|████████████████████▏                                                                   | 401/1751 [6:43:20<21:56:35, 58.52s/it] 23%|████████████████████▏                                                                   | 402/1751 [6:44:22<22:18:44, 59.54s/it]                                                                                                                                     {'loss': '0.4906', 'grad_norm': '0.1914', 'learning_rate': '1.829e-05', 'ppl': '1.633', 'memory/max_active (GiB)': '74.42', 'memory/max_allocated (GiB)': '74.42', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '72.43', 'tokens/total': 762054720, 'tokens/trainable': 281426432, 'epoch': '0.6889'}
 23%|████████████████████▏                                                                   | 402/1751 [6:44:22<22:18:44, 59.54s/it] 23%|████████████████████▎                                                                   | 403/1751 [6:45:24<22:33:58, 60.27s/it]                                                                                                                                     {'loss': '0.5421', 'grad_norm': '0.1846', 'learning_rate': '1.828e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '72.16', 'memory/max_allocated (GiB)': '72.16', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '159.5', 'tokens/total': 764009216, 'tokens/trainable': 282172416, 'epoch': '0.6906'}
 23%|████████████████████▎                                                                   | 403/1751 [6:45:24<22:33:58, 60.27s/it] 23%|████████████████████▎                                                                   | 404/1751 [6:46:23<22:27:03, 60.00s/it]                                                                                                                                     {'loss': '0.5553', 'grad_norm': '0.1963', 'learning_rate': '1.827e-05', 'ppl': '1.742', 'memory/max_active (GiB)': '71.8', 'memory/max_allocated (GiB)': '71.8', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '136.8', 'tokens/total': 765847360, 'tokens/trainable': 282883168, 'epoch': '0.6923'}
 23%|████████████████████▎                                                                   | 404/1751 [6:46:23<22:27:03, 60.00s/it] 23%|████████████████████▎                                                                   | 405/1751 [6:47:23<22:23:35, 59.89s/it]                                                                                                                                     {'loss': '0.5174', 'grad_norm': '0.2061', 'learning_rate': '1.826e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '76.11', 'memory/max_allocated (GiB)': '76.11', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '55.02', 'tokens/total': 767745024, 'tokens/trainable': 283585504, 'epoch': '0.694'}
 23%|████████████████████▎                                                                   | 405/1751 [6:47:23<22:23:35, 59.89s/it] 23%|████████████████████▍                                                                   | 406/1751 [6:48:24<22:32:10, 60.32s/it]                                                                                                                                     {'loss': '0.5355', 'grad_norm': '0.2227', 'learning_rate': '1.825e-05', 'ppl': '1.708', 'memory/max_active (GiB)': '73.56', 'memory/max_allocated (GiB)': '73.56', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '33.15', 'tokens/total': 769648576, 'tokens/trainable': 284304896, 'epoch': '0.6957'}
 23%|████████████████████▍                                                                   | 406/1751 [6:48:24<22:32:10, 60.32s/it] 23%|████████████████████▍                                                                   | 407/1751 [6:49:25<22:33:21, 60.42s/it]                                                                                                                                     {'loss': '0.5266', 'grad_norm': '0.2168', 'learning_rate': '1.824e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '75.85', 'memory/max_allocated (GiB)': '75.85', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '72.39', 'tokens/total': 771557120, 'tokens/trainable': 284989536, 'epoch': '0.6974'}
 23%|████████████████████▍                                                                   | 407/1751 [6:49:25<22:33:21, 60.42s/it] 23%|████████████████████▌                                                                   | 408/1751 [6:50:26<22:36:14, 60.59s/it]                                                                                                                                     {'loss': '0.5439', 'grad_norm': '0.208', 'learning_rate': '1.823e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '76.33', 'memory/max_allocated (GiB)': '76.33', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '61.83', 'tokens/total': 773505408, 'tokens/trainable': 285722528, 'epoch': '0.6991'}
 23%|████████████████████▌                                                                   | 408/1751 [6:50:26<22:36:14, 60.59s/it] 23%|████████████████████▌                                                                   | 409/1751 [6:51:25<22:22:44, 60.03s/it]                                                                                                                                     {'loss': '0.5514', 'grad_norm': '0.2158', 'learning_rate': '1.822e-05', 'ppl': '1.736', 'memory/max_active (GiB)': '74.96', 'memory/max_allocated (GiB)': '74.96', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '61.04', 'tokens/total': 775376704, 'tokens/trainable': 286434048, 'epoch': '0.7009'}
 23%|████████████████████▌                                                                   | 409/1751 [6:51:25<22:22:44, 60.03s/it] 23%|████████████████████▌                                                                   | 410/1751 [6:52:23<22:10:20, 59.52s/it]                                                                                                                                     {'loss': '0.5428', 'grad_norm': '0.1924', 'learning_rate': '1.821e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '76.99', 'memory/max_allocated (GiB)': '76.99', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '168.7', 'tokens/total': 777240896, 'tokens/trainable': 287120160, 'epoch': '0.7026'}
 23%|████████████████████▌                                                                   | 410/1751 [6:52:23<22:10:20, 59.52s/it] 23%|████████████████████▋                                                                   | 411/1751 [6:53:21<22:01:25, 59.17s/it]                                                                                                                                     {'loss': '0.527', 'grad_norm': '0.207', 'learning_rate': '1.82e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '76.77', 'memory/max_allocated (GiB)': '76.77', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '63.64', 'tokens/total': 779057664, 'tokens/trainable': 287787008, 'epoch': '0.7043'}
 23%|████████████████████▋                                                                   | 411/1751 [6:53:21<22:01:25, 59.17s/it] 24%|████████████████████▋                                                                   | 412/1751 [6:54:19<21:51:16, 58.76s/it]                                                                                                                                     {'loss': '0.5892', 'grad_norm': '0.2383', 'learning_rate': '1.819e-05', 'ppl': '1.803', 'memory/max_active (GiB)': '76.41', 'memory/max_allocated (GiB)': '76.41', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '134.5', 'tokens/total': 780839872, 'tokens/trainable': 288437184, 'epoch': '0.706'}
 24%|████████████████████▋                                                                   | 412/1751 [6:54:19<21:51:16, 58.76s/it] 24%|████████████████████▊                                                                   | 413/1751 [6:55:21<22:12:16, 59.74s/it]                                                                                                                                     {'loss': '0.5203', 'grad_norm': '0.1934', 'learning_rate': '1.818e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '74.58', 'memory/max_allocated (GiB)': '74.58', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '20.35', 'tokens/total': 782872768, 'tokens/trainable': 289179744, 'epoch': '0.7077'}
 24%|████████████████████▊                                                                   | 413/1751 [6:55:21<22:12:16, 59.74s/it] 24%|████████████████████▊                                                                   | 414/1751 [6:56:22<22:15:41, 59.94s/it]                                                                                                                                     {'loss': '0.5245', 'grad_norm': '0.2061', 'learning_rate': '1.816e-05', 'ppl': '1.69', 'memory/max_active (GiB)': '71.64', 'memory/max_allocated (GiB)': '71.64', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '77.3', 'tokens/total': 784787648, 'tokens/trainable': 289854208, 'epoch': '0.7094'}
 24%|████████████████████▊                                                                   | 414/1751 [6:56:22<22:15:41, 59.94s/it] 24%|████████████████████▊                                                                   | 415/1751 [6:57:21<22:07:50, 59.63s/it]                                                                                                                                     {'loss': '0.5233', 'grad_norm': '0.1973', 'learning_rate': '1.815e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '76.6', 'memory/max_allocated (GiB)': '76.6', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '64.22', 'tokens/total': 786673344, 'tokens/trainable': 290509888, 'epoch': '0.7111'}
 24%|████████████████████▊                                                                   | 415/1751 [6:57:21<22:07:50, 59.63s/it] 24%|████████████████████▉                                                                   | 416/1751 [6:58:21<22:12:53, 59.91s/it]                                                                                                                                     {'loss': '0.5285', 'grad_norm': '0.1992', 'learning_rate': '1.814e-05', 'ppl': '1.696', 'memory/max_active (GiB)': '77.15', 'memory/max_allocated (GiB)': '77.15', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '145.5', 'tokens/total': 788545792, 'tokens/trainable': 291166560, 'epoch': '0.7129'}
 24%|████████████████████▉                                                                   | 416/1751 [6:58:21<22:12:53, 59.91s/it] 24%|████████████████████▉                                                                   | 417/1751 [6:59:19<21:56:56, 59.23s/it]                                                                                                                                     {'loss': '0.5756', 'grad_norm': '0.208', 'learning_rate': '1.813e-05', 'ppl': '1.778', 'memory/max_active (GiB)': '68.9', 'memory/max_allocated (GiB)': '68.9', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '80.83', 'tokens/total': 790306240, 'tokens/trainable': 291832160, 'epoch': '0.7146'}
 24%|████████████████████▉                                                                   | 417/1751 [6:59:19<21:56:56, 59.23s/it] 24%|█████████████████████                                                                   | 418/1751 [7:00:21<22:12:46, 59.99s/it]                                                                                                                                     {'loss': '0.5168', 'grad_norm': '0.1846', 'learning_rate': '1.812e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '70', 'memory/max_allocated (GiB)': '70', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '46.93', 'tokens/total': 792287040, 'tokens/trainable': 292600512, 'epoch': '0.7163'}
 24%|█████████████████████                                                                   | 418/1751 [7:00:21<22:12:46, 59.99s/it] 24%|█████████████████████                                                                   | 419/1751 [7:01:18<21:55:42, 59.27s/it]                                                                                                                                     {'loss': '0.5153', 'grad_norm': '0.1865', 'learning_rate': '1.811e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '75.12', 'memory/max_allocated (GiB)': '75.12', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '73.85', 'tokens/total': 794106816, 'tokens/trainable': 293297120, 'epoch': '0.718'}
 24%|█████████████████████                                                                   | 419/1751 [7:01:18<21:55:42, 59.27s/it] 24%|█████████████████████                                                                   | 420/1751 [7:02:20<22:15:06, 60.19s/it]                                                                                                                                     {'loss': '0.5339', 'grad_norm': '0.1855', 'learning_rate': '1.81e-05', 'ppl': '1.706', 'memory/max_active (GiB)': '72.91', 'memory/max_allocated (GiB)': '72.91', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '18.03', 'tokens/total': 796099648, 'tokens/trainable': 294027968, 'epoch': '0.7197'}
 24%|█████████████████████                                                                   | 420/1751 [7:02:20<22:15:06, 60.19s/it] 24%|█████████████████████▏                                                                  | 421/1751 [7:03:20<22:10:36, 60.03s/it]                                                                                                                                     {'loss': '0.5189', 'grad_norm': '0.1914', 'learning_rate': '1.809e-05', 'ppl': '1.68', 'memory/max_active (GiB)': '75.45', 'memory/max_allocated (GiB)': '75.45', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '171.2', 'tokens/total': 797928448, 'tokens/trainable': 294732800, 'epoch': '0.7214'}
 24%|█████████████████████▏                                                                  | 421/1751 [7:03:20<22:10:36, 60.03s/it] 24%|█████████████████████▏                                                                  | 422/1751 [7:04:22<22:23:17, 60.65s/it]                                                                                                                                     {'loss': '0.5351', 'grad_norm': '0.1846', 'learning_rate': '1.808e-05', 'ppl': '1.708', 'memory/max_active (GiB)': '76.7', 'memory/max_allocated (GiB)': '76.7', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '47.42', 'tokens/total': 799883392, 'tokens/trainable': 295466400, 'epoch': '0.7231'}
 24%|█████████████████████▏                                                                  | 422/1751 [7:04:22<22:23:17, 60.65s/it] 24%|█████████████████████▎                                                                  | 423/1751 [7:05:21<22:09:09, 60.05s/it]                                                                                                                                     {'loss': '0.5502', 'grad_norm': '0.1992', 'learning_rate': '1.807e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '72.83', 'memory/max_allocated (GiB)': '72.83', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '109.9', 'tokens/total': 801742144, 'tokens/trainable': 296178368, 'epoch': '0.7248'}
 24%|█████████████████████▎                                                                  | 423/1751 [7:05:21<22:09:09, 60.05s/it] 24%|█████████████████████▎                                                                  | 424/1751 [7:06:21<22:05:59, 59.95s/it]                                                                                                                                     {'loss': '0.5454', 'grad_norm': '0.1895', 'learning_rate': '1.805e-05', 'ppl': '1.725', 'memory/max_active (GiB)': '71.78', 'memory/max_allocated (GiB)': '71.78', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '96.05', 'tokens/total': 803646720, 'tokens/trainable': 296919360, 'epoch': '0.7266'}
 24%|█████████████████████▎                                                                  | 424/1751 [7:06:21<22:05:59, 59.95s/it] 24%|█████████████████████▎                                                                  | 425/1751 [7:07:23<22:19:23, 60.61s/it]                                                                                                                                     {'loss': '0.5197', 'grad_norm': '0.1904', 'learning_rate': '1.804e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '75.65', 'memory/max_allocated (GiB)': '75.65', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '98.67', 'tokens/total': 805660224, 'tokens/trainable': 297697280, 'epoch': '0.7283'}
 24%|█████████████████████▎                                                                  | 425/1751 [7:07:23<22:19:23, 60.61s/it] 24%|█████████████████████▍                                                                  | 426/1751 [7:08:23<22:13:52, 60.40s/it]                                                                                                                                     {'loss': '0.5291', 'grad_norm': '0.1836', 'learning_rate': '1.803e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '77.72', 'memory/max_allocated (GiB)': '77.72', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '10.95', 'tokens/total': 807570432, 'tokens/trainable': 298373312, 'epoch': '0.73'}
 24%|█████████████████████▍                                                                  | 426/1751 [7:08:23<22:13:52, 60.40s/it] 24%|█████████████████████▍                                                                  | 427/1751 [7:09:23<22:11:27, 60.34s/it]                                                                                                                                     {'loss': '0.5367', 'grad_norm': '0.1992', 'learning_rate': '1.802e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '72.01', 'memory/max_allocated (GiB)': '72.01', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '32.24', 'tokens/total': 809466816, 'tokens/trainable': 299090048, 'epoch': '0.7317'}
 24%|█████████████████████▍                                                                  | 427/1751 [7:09:23<22:11:27, 60.34s/it] 24%|█████████████████████▌                                                                  | 428/1751 [7:10:24<22:16:15, 60.60s/it]                                                                                                                                     {'loss': '0.5318', 'grad_norm': '0.1982', 'learning_rate': '1.801e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '76.07', 'memory/max_allocated (GiB)': '76.07', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '187.3', 'tokens/total': 811422720, 'tokens/trainable': 299798432, 'epoch': '0.7334'}
 24%|█████████████████████▌                                                                  | 428/1751 [7:10:24<22:16:15, 60.60s/it] 25%|█████████████████████▌                                                                  | 429/1751 [7:11:25<22:17:41, 60.71s/it]                                                                                                                                     {'loss': '0.5121', 'grad_norm': '0.1826', 'learning_rate': '1.8e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '69.91', 'memory/max_allocated (GiB)': '69.91', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '23.93', 'tokens/total': 813326976, 'tokens/trainable': 300526560, 'epoch': '0.7351'}
 25%|█████████████████████▌                                                                  | 429/1751 [7:11:25<22:17:41, 60.71s/it] 25%|█████████████████████▌                                                                  | 430/1751 [7:12:27<22:21:50, 60.95s/it]                                                                                                                                     {'loss': '0.5256', 'grad_norm': '0.1875', 'learning_rate': '1.799e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '72.38', 'memory/max_allocated (GiB)': '72.38', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '32.69', 'tokens/total': 815273024, 'tokens/trainable': 301251648, 'epoch': '0.7368'}
 25%|█████████████████████▌                                                                  | 430/1751 [7:12:27<22:21:50, 60.95s/it] 25%|█████████████████████▋                                                                  | 431/1751 [7:13:27<22:14:37, 60.66s/it]                                                                                                                                     {'loss': '0.508', 'grad_norm': '0.2012', 'learning_rate': '1.798e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '78.86', 'memory/max_allocated (GiB)': '78.86', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '91.24', 'tokens/total': 817193088, 'tokens/trainable': 301953056, 'epoch': '0.7386'}
 25%|█████████████████████▋                                                                  | 431/1751 [7:13:27<22:14:37, 60.66s/it] 25%|█████████████████████▋                                                                  | 432/1751 [7:14:28<22:15:41, 60.76s/it]                                                                                                                                     {'loss': '0.5331', 'grad_norm': '0.2031', 'learning_rate': '1.796e-05', 'ppl': '1.704', 'memory/max_active (GiB)': '74.5', 'memory/max_allocated (GiB)': '74.5', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '114.4', 'tokens/total': 819138048, 'tokens/trainable': 302648576, 'epoch': '0.7403'}
 25%|█████████████████████▋                                                                  | 432/1751 [7:14:28<22:15:41, 60.76s/it] 25%|█████████████████████▊                                                                  | 433/1751 [7:15:29<22:19:04, 60.96s/it]                                                                                                                                     {'loss': '0.522', 'grad_norm': '0.209', 'learning_rate': '1.795e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '74.61', 'memory/max_allocated (GiB)': '74.61', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '64.86', 'tokens/total': 821079936, 'tokens/trainable': 303389632, 'epoch': '0.742'}
 25%|█████████████████████▊                                                                  | 433/1751 [7:15:29<22:19:04, 60.96s/it] 25%|█████████████████████▊                                                                  | 434/1751 [7:16:29<22:14:12, 60.78s/it]                                                                                                                                     {'loss': '0.5496', 'grad_norm': '0.1904', 'learning_rate': '1.794e-05', 'ppl': '1.733', 'memory/max_active (GiB)': '76.11', 'memory/max_allocated (GiB)': '76.11', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '49.4', 'tokens/total': 822971712, 'tokens/trainable': 304128896, 'epoch': '0.7437'}
 25%|█████████████████████▊                                                                  | 434/1751 [7:16:29<22:14:12, 60.78s/it] 25%|█████████████████████▊                                                                  | 435/1751 [7:17:28<21:59:57, 60.18s/it]                                                                                                                                     {'loss': '0.5661', 'grad_norm': '0.2012', 'learning_rate': '1.793e-05', 'ppl': '1.761', 'memory/max_active (GiB)': '76.1', 'memory/max_allocated (GiB)': '76.1', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '94.4', 'tokens/total': 824792128, 'tokens/trainable': 304827168, 'epoch': '0.7454'}
 25%|█████████████████████▊                                                                  | 435/1751 [7:17:28<21:59:57, 60.18s/it] 25%|█████████████████████▉                                                                  | 436/1751 [7:18:29<22:04:42, 60.44s/it]                                                                                                                                     {'loss': '0.5537', 'grad_norm': '0.1943', 'learning_rate': '1.792e-05', 'ppl': '1.74', 'memory/max_active (GiB)': '72.28', 'memory/max_allocated (GiB)': '72.28', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '156.6', 'tokens/total': 826702144, 'tokens/trainable': 305528640, 'epoch': '0.7471'}
 25%|█████████████████████▉                                                                  | 436/1751 [7:18:29<22:04:42, 60.44s/it] 25%|█████████████████████▉                                                                  | 437/1751 [7:19:30<22:03:18, 60.43s/it]                                                                                                                                     {'loss': '0.5362', 'grad_norm': '0.1943', 'learning_rate': '1.791e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '75.43', 'memory/max_allocated (GiB)': '75.43', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '110.2', 'tokens/total': 828633280, 'tokens/trainable': 306301728, 'epoch': '0.7488'}
 25%|█████████████████████▉                                                                  | 437/1751 [7:19:30<22:03:18, 60.43s/it] 25%|██████████████████████                                                                  | 438/1751 [7:20:28<21:48:38, 59.80s/it]                                                                                                                                     {'loss': '0.5389', 'grad_norm': '0.207', 'learning_rate': '1.79e-05', 'ppl': '1.714', 'memory/max_active (GiB)': '70.68', 'memory/max_allocated (GiB)': '70.68', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '90.03', 'tokens/total': 830481920, 'tokens/trainable': 306989312, 'epoch': '0.7506'}
 25%|██████████████████████                                                                  | 438/1751 [7:20:28<21:48:38, 59.80s/it] 25%|██████████████████████                                                                  | 439/1751 [7:21:27<21:46:28, 59.75s/it]                                                                                                                                     {'loss': '0.574', 'grad_norm': '0.2178', 'learning_rate': '1.788e-05', 'ppl': '1.775', 'memory/max_active (GiB)': '73.65', 'memory/max_allocated (GiB)': '73.65', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '64.73', 'tokens/total': 832365440, 'tokens/trainable': 307684928, 'epoch': '0.7523'}
 25%|██████████████████████                                                                  | 439/1751 [7:21:28<21:46:28, 59.75s/it] 25%|██████████████████████                                                                  | 440/1751 [7:22:27<21:42:20, 59.60s/it]                                                                                                                                     {'loss': '0.5433', 'grad_norm': '0.2031', 'learning_rate': '1.787e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '69.16', 'memory/max_allocated (GiB)': '69.16', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '26.17', 'tokens/total': 834202176, 'tokens/trainable': 308353056, 'epoch': '0.754'}
 25%|██████████████████████                                                                  | 440/1751 [7:22:27<21:42:20, 59.60s/it] 25%|██████████████████████▏                                                                 | 441/1751 [7:23:28<21:49:45, 59.99s/it]                                                                                                                                     {'loss': '0.5197', 'grad_norm': '0.1855', 'learning_rate': '1.786e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '75.54', 'memory/max_allocated (GiB)': '75.54', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '134.9', 'tokens/total': 836157504, 'tokens/trainable': 309069920, 'epoch': '0.7557'}
 25%|██████████████████████▏                                                                 | 441/1751 [7:23:28<21:49:45, 59.99s/it] 25%|██████████████████████▏                                                                 | 442/1751 [7:24:25<21:30:06, 59.13s/it]                                                                                                                                     {'loss': '0.5763', 'grad_norm': '0.2178', 'learning_rate': '1.785e-05', 'ppl': '1.779', 'memory/max_active (GiB)': '75.91', 'memory/max_allocated (GiB)': '75.91', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '165.5', 'tokens/total': 837944448, 'tokens/trainable': 309734016, 'epoch': '0.7574'}
 25%|██████████████████████▏                                                                 | 442/1751 [7:24:25<21:30:06, 59.13s/it] 25%|██████████████████████▎                                                                 | 443/1751 [7:25:23<21:19:51, 58.71s/it]                                                                                                                                     {'loss': '0.5435', 'grad_norm': '0.2217', 'learning_rate': '1.784e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '76.19', 'memory/max_allocated (GiB)': '76.19', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '87.47', 'tokens/total': 839777472, 'tokens/trainable': 310382432, 'epoch': '0.7591'}
 25%|██████████████████████▎                                                                 | 443/1751 [7:25:23<21:19:51, 58.71s/it] 25%|██████████████████████▎                                                                 | 444/1751 [7:26:20<21:08:07, 58.22s/it]                                                                                                                                     {'loss': '0.5403', 'grad_norm': '0.2061', 'learning_rate': '1.783e-05', 'ppl': '1.716', 'memory/max_active (GiB)': '73.51', 'memory/max_allocated (GiB)': '73.51', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '162.7', 'tokens/total': 841576704, 'tokens/trainable': 311049984, 'epoch': '0.7608'}
 25%|██████████████████████▎                                                                 | 444/1751 [7:26:20<21:08:07, 58.22s/it] 25%|██████████████████████▎                                                                 | 445/1751 [7:27:21<21:28:35, 59.20s/it]                                                                                                                                     {'loss': '0.5308', 'grad_norm': '0.1904', 'learning_rate': '1.781e-05', 'ppl': '1.7', 'memory/max_active (GiB)': '76.21', 'memory/max_allocated (GiB)': '76.21', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '55.96', 'tokens/total': 843462016, 'tokens/trainable': 311756224, 'epoch': '0.7625'}
 25%|██████████████████████▎                                                                 | 445/1751 [7:27:21<21:28:35, 59.20s/it] 25%|██████████████████████▍                                                                 | 446/1751 [7:28:21<21:33:28, 59.47s/it]                                                                                                                                     {'loss': '0.5174', 'grad_norm': '0.2148', 'learning_rate': '1.78e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '72.45', 'memory/max_allocated (GiB)': '72.45', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '59.23', 'tokens/total': 845318784, 'tokens/trainable': 312442400, 'epoch': '0.7643'}
 25%|██████████████████████▍                                                                 | 446/1751 [7:28:21<21:33:28, 59.47s/it] 26%|██████████████████████▍                                                                 | 447/1751 [7:29:22<21:42:29, 59.93s/it]                                                                                                                                     {'loss': '0.5106', 'grad_norm': '0.208', 'learning_rate': '1.779e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '74.26', 'memory/max_allocated (GiB)': '74.26', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '39.01', 'tokens/total': 847242368, 'tokens/trainable': 313189312, 'epoch': '0.766'}
 26%|██████████████████████▍                                                                 | 447/1751 [7:29:22<21:42:29, 59.93s/it] 26%|██████████████████████▌                                                                 | 448/1751 [7:30:23<21:50:21, 60.34s/it]                                                                                                                                     {'loss': '0.5245', 'grad_norm': '0.2002', 'learning_rate': '1.778e-05', 'ppl': '1.69', 'memory/max_active (GiB)': '75.06', 'memory/max_allocated (GiB)': '75.06', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '57.77', 'tokens/total': 849170560, 'tokens/trainable': 313905984, 'epoch': '0.7677'}
 26%|██████████████████████▌                                                                 | 448/1751 [7:30:23<21:50:21, 60.34s/it] 26%|██████████████████████▌                                                                 | 449/1751 [7:31:23<21:41:04, 59.96s/it]                                                                                                                                     {'loss': '0.5137', 'grad_norm': '0.1934', 'learning_rate': '1.777e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '72.97', 'memory/max_allocated (GiB)': '72.97', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '64.64', 'tokens/total': 851024256, 'tokens/trainable': 314552704, 'epoch': '0.7694'}
 26%|██████████████████████▌                                                                 | 449/1751 [7:31:23<21:41:04, 59.96s/it] 26%|██████████████████████▌                                                                 | 450/1751 [7:32:22<21:35:17, 59.74s/it]                                                                                                                                     {'loss': '0.5406', 'grad_norm': '0.1982', 'learning_rate': '1.775e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '75.62', 'memory/max_allocated (GiB)': '75.62', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '48.65', 'tokens/total': 852870144, 'tokens/trainable': 315226112, 'epoch': '0.7711'}
 26%|██████████████████████▌                                                                 | 450/1751 [7:32:22<21:35:17, 59.74s/it] 26%|██████████████████████▋                                                                 | 451/1751 [7:33:21<21:30:01, 59.54s/it]                                                                                                                                     {'loss': '0.5589', 'grad_norm': '0.1953', 'learning_rate': '1.774e-05', 'ppl': '1.749', 'memory/max_active (GiB)': '73.21', 'memory/max_allocated (GiB)': '73.21', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '20.21', 'tokens/total': 854729728, 'tokens/trainable': 315951584, 'epoch': '0.7728'}
 26%|██████████████████████▋                                                                 | 451/1751 [7:33:21<21:30:01, 59.54s/it] 26%|██████████████████████▋                                                                 | 452/1751 [7:34:21<21:32:50, 59.72s/it]                                                                                                                                     {'loss': '0.5378', 'grad_norm': '0.1992', 'learning_rate': '1.773e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '75.01', 'memory/max_allocated (GiB)': '75.01', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '150.8', 'tokens/total': 856656768, 'tokens/trainable': 316682464, 'epoch': '0.7745'}
 26%|██████████████████████▋                                                                 | 452/1751 [7:34:21<21:32:50, 59.72s/it] 26%|██████████████████████▊                                                                 | 453/1751 [7:35:19<21:18:31, 59.10s/it]                                                                                                                                     {'loss': '0.5732', 'grad_norm': '0.2168', 'learning_rate': '1.772e-05', 'ppl': '1.774', 'memory/max_active (GiB)': '70.03', 'memory/max_allocated (GiB)': '70.03', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '111.2', 'tokens/total': 858452416, 'tokens/trainable': 317353024, 'epoch': '0.7763'}
 26%|██████████████████████▊                                                                 | 453/1751 [7:35:19<21:18:31, 59.10s/it] 26%|██████████████████████▊                                                                 | 454/1751 [7:36:19<21:28:39, 59.61s/it]                                                                                                                                     {'loss': '0.4998', 'grad_norm': '0.1914', 'learning_rate': '1.771e-05', 'ppl': '1.648', 'memory/max_active (GiB)': '75.47', 'memory/max_allocated (GiB)': '75.47', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '116.3', 'tokens/total': 860410304, 'tokens/trainable': 318056224, 'epoch': '0.778'}
 26%|██████████████████████▊                                                                 | 454/1751 [7:36:19<21:28:39, 59.61s/it] 26%|██████████████████████▊                                                                 | 455/1751 [7:37:22<21:46:23, 60.48s/it]                                                                                                                                     {'loss': '0.5326', 'grad_norm': '0.2002', 'learning_rate': '1.769e-05', 'ppl': '1.703', 'memory/max_active (GiB)': '75.71', 'memory/max_allocated (GiB)': '75.71', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '23.76', 'tokens/total': 862368896, 'tokens/trainable': 318784736, 'epoch': '0.7797'}
 26%|██████████████████████▊                                                                 | 455/1751 [7:37:22<21:46:23, 60.48s/it] 26%|██████████████████████▉                                                                 | 456/1751 [7:38:24<21:57:09, 61.03s/it]                                                                                                                                     {'loss': '0.5137', 'grad_norm': '0.1748', 'learning_rate': '1.768e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '76.08', 'memory/max_allocated (GiB)': '76.08', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '83.02', 'tokens/total': 864376832, 'tokens/trainable': 319544896, 'epoch': '0.7814'}
 26%|██████████████████████▉                                                                 | 456/1751 [7:38:24<21:57:09, 61.03s/it] 26%|██████████████████████▉                                                                 | 457/1751 [7:39:25<21:51:47, 60.83s/it]                                                                                                                                     {'loss': '0.5272', 'grad_norm': '0.2129', 'learning_rate': '1.767e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '72.84', 'memory/max_allocated (GiB)': '72.84', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '116.3', 'tokens/total': 866308864, 'tokens/trainable': 320227040, 'epoch': '0.7831'}
 26%|██████████████████████▉                                                                 | 457/1751 [7:39:25<21:51:47, 60.83s/it] 26%|███████████████████████                                                                 | 458/1751 [7:40:24<21:38:30, 60.26s/it]                                                                                                                                     {'loss': '0.5358', 'grad_norm': '0.2031', 'learning_rate': '1.766e-05', 'ppl': '1.709', 'memory/max_active (GiB)': '77.72', 'memory/max_allocated (GiB)': '77.72', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '117', 'tokens/total': 868161472, 'tokens/trainable': 320922464, 'epoch': '0.7848'}
 26%|███████████████████████                                                                 | 458/1751 [7:40:24<21:38:30, 60.26s/it] 26%|███████████████████████                                                                 | 459/1751 [7:41:25<21:47:23, 60.71s/it]                                                                                                                                     {'loss': '0.5089', 'grad_norm': '0.1982', 'learning_rate': '1.765e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '75.34', 'memory/max_allocated (GiB)': '75.34', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '83.53', 'tokens/total': 870119488, 'tokens/trainable': 321647360, 'epoch': '0.7865'}
 26%|███████████████████████                                                                 | 459/1751 [7:41:25<21:47:23, 60.71s/it] 26%|███████████████████████                                                                 | 460/1751 [7:42:26<21:48:08, 60.80s/it]                                                                                                                                     {'loss': '0.4995', 'grad_norm': '0.1992', 'learning_rate': '1.763e-05', 'ppl': '1.648', 'memory/max_active (GiB)': '71.37', 'memory/max_allocated (GiB)': '71.37', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '73.66', 'tokens/total': 872065792, 'tokens/trainable': 322372480, 'epoch': '0.7883'}
 26%|███████████████████████                                                                 | 460/1751 [7:42:26<21:48:08, 60.80s/it] 26%|███████████████████████▏                                                                | 461/1751 [7:43:28<21:55:10, 61.17s/it]                                                                                                                                     {'loss': '0.5214', 'grad_norm': '0.1768', 'learning_rate': '1.762e-05', 'ppl': '1.684', 'memory/max_active (GiB)': '75.8', 'memory/max_allocated (GiB)': '75.8', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '138.1', 'tokens/total': 874051328, 'tokens/trainable': 323100416, 'epoch': '0.79'}
 26%|███████████████████████▏                                                                | 461/1751 [7:43:28<21:55:10, 61.17s/it] 26%|███████████████████████▏                                                                | 462/1751 [7:44:27<21:40:03, 60.51s/it]                                                                                                                                     {'loss': '0.5405', 'grad_norm': '0.2148', 'learning_rate': '1.761e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '76.64', 'memory/max_allocated (GiB)': '76.64', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '31.64', 'tokens/total': 875874560, 'tokens/trainable': 323751808, 'epoch': '0.7917'}
 26%|███████████████████████▏                                                                | 462/1751 [7:44:27<21:40:03, 60.51s/it] 26%|███████████████████████▎                                                                | 463/1751 [7:45:23<21:10:57, 59.21s/it]                                                                                                                                     {'loss': '0.556', 'grad_norm': '0.21', 'learning_rate': '1.76e-05', 'ppl': '1.744', 'memory/max_active (GiB)': '70.17', 'memory/max_allocated (GiB)': '70.17', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '119.3', 'tokens/total': 877611904, 'tokens/trainable': 324398912, 'epoch': '0.7934'}
 26%|███████████████████████▎                                                                | 463/1751 [7:45:23<21:10:57, 59.21s/it] 26%|███████████████████████▎                                                                | 464/1751 [7:46:24<21:19:24, 59.65s/it]                                                                                                                                     {'loss': '0.5237', 'grad_norm': '0.2031', 'learning_rate': '1.758e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '74.91', 'memory/max_allocated (GiB)': '74.91', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '51.99', 'tokens/total': 879586880, 'tokens/trainable': 325123008, 'epoch': '0.7951'}
 26%|███████████████████████▎                                                                | 464/1751 [7:46:24<21:19:24, 59.65s/it] 27%|███████████████████████▎                                                                | 465/1751 [7:47:25<21:24:49, 59.95s/it]                                                                                                                                     {'loss': '0.5381', 'grad_norm': '0.2041', 'learning_rate': '1.757e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '72.65', 'memory/max_allocated (GiB)': '72.65', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '42.3', 'tokens/total': 881503488, 'tokens/trainable': 325827488, 'epoch': '0.7968'}
 27%|███████████████████████▎                                                                | 465/1751 [7:47:25<21:24:49, 59.95s/it] 27%|███████████████████████▍                                                                | 466/1751 [7:48:27<21:35:15, 60.48s/it]                                                                                                                                     {'loss': '0.5257', 'grad_norm': '0.1865', 'learning_rate': '1.756e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '77.4', 'memory/max_allocated (GiB)': '77.4', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '164', 'tokens/total': 883437056, 'tokens/trainable': 326565088, 'epoch': '0.7985'}
 27%|███████████████████████▍                                                                | 466/1751 [7:48:27<21:35:15, 60.48s/it] 27%|███████████████████████▍                                                                | 467/1751 [7:49:26<21:30:10, 60.29s/it]                                                                                                                                     {'loss': '0.5178', 'grad_norm': '0.1904', 'learning_rate': '1.755e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '73.82', 'memory/max_allocated (GiB)': '73.82', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '118.9', 'tokens/total': 885333184, 'tokens/trainable': 327280672, 'epoch': '0.8002'}
 27%|███████████████████████▍                                                                | 467/1751 [7:49:26<21:30:10, 60.29s/it] 27%|███████████████████████▌                                                                | 468/1751 [7:50:26<21:27:42, 60.22s/it]                                                                                                                                     {'loss': '0.511', 'grad_norm': '0.1982', 'learning_rate': '1.753e-05', 'ppl': '1.667', 'memory/max_active (GiB)': '75.84', 'memory/max_allocated (GiB)': '75.84', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '96.87', 'tokens/total': 887231360, 'tokens/trainable': 327966400, 'epoch': '0.802'}
 27%|███████████████████████▌                                                                | 468/1751 [7:50:26<21:27:42, 60.22s/it] 27%|███████████████████████▌                                                                | 469/1751 [7:51:27<21:26:25, 60.21s/it]                                                                                                                                     {'loss': '0.5174', 'grad_norm': '0.1846', 'learning_rate': '1.752e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '75.09', 'memory/max_allocated (GiB)': '75.09', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '71.76', 'tokens/total': 889201408, 'tokens/trainable': 328678336, 'epoch': '0.8037'}
 27%|███████████████████████▌                                                                | 469/1751 [7:51:27<21:26:25, 60.21s/it] 27%|███████████████████████▌                                                                | 470/1751 [7:52:28<21:29:56, 60.42s/it]                                                                                                                                     {'loss': '0.5088', 'grad_norm': '0.1855', 'learning_rate': '1.751e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '71.6', 'memory/max_allocated (GiB)': '71.6', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '78.48', 'tokens/total': 891158784, 'tokens/trainable': 329400256, 'epoch': '0.8054'}
 27%|███████████████████████▌                                                                | 470/1751 [7:52:28<21:29:56, 60.42s/it] 27%|███████████████████████▋                                                                | 471/1751 [7:53:29<21:36:51, 60.79s/it]                                                                                                                                     {'loss': '0.5317', 'grad_norm': '0.2119', 'learning_rate': '1.75e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '75.15', 'memory/max_allocated (GiB)': '75.15', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '38.91', 'tokens/total': 893138624, 'tokens/trainable': 330148608, 'epoch': '0.8071'}
 27%|███████████████████████▋                                                                | 471/1751 [7:53:29<21:36:51, 60.79s/it] 27%|███████████████████████▋                                                                | 472/1751 [7:54:30<21:39:17, 60.95s/it]                                                                                                                                     {'loss': '0.561', 'grad_norm': '0.21', 'learning_rate': '1.749e-05', 'ppl': '1.752', 'memory/max_active (GiB)': '74.39', 'memory/max_allocated (GiB)': '74.39', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '27.32', 'tokens/total': 895107264, 'tokens/trainable': 330841056, 'epoch': '0.8088'}
 27%|███████████████████████▋                                                                | 472/1751 [7:54:31<21:39:17, 60.95s/it] 27%|███████████████████████▊                                                                | 473/1751 [7:55:32<21:41:38, 61.11s/it]                                                                                                                                     {'loss': '0.516', 'grad_norm': '0.1846', 'learning_rate': '1.747e-05', 'ppl': '1.675', 'memory/max_active (GiB)': '71.63', 'memory/max_allocated (GiB)': '71.63', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '78.07', 'tokens/total': 897102464, 'tokens/trainable': 331559136, 'epoch': '0.8105'}
 27%|███████████████████████▊                                                                | 473/1751 [7:55:32<21:41:38, 61.11s/it] 27%|███████████████████████▊                                                                | 474/1751 [7:56:33<21:39:22, 61.05s/it]                                                                                                                                     {'loss': '0.5225', 'grad_norm': '0.2207', 'learning_rate': '1.746e-05', 'ppl': '1.686', 'memory/max_active (GiB)': '75.96', 'memory/max_allocated (GiB)': '75.96', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '40.43', 'tokens/total': 899030976, 'tokens/trainable': 332272352, 'epoch': '0.8122'}
 27%|███████████████████████▊                                                                | 474/1751 [7:56:33<21:39:22, 61.05s/it] 27%|███████████████████████▊                                                                | 475/1751 [7:57:34<21:41:03, 61.18s/it]                                                                                                                                     {'loss': '0.5523', 'grad_norm': '0.2139', 'learning_rate': '1.745e-05', 'ppl': '1.737', 'memory/max_active (GiB)': '75.88', 'memory/max_allocated (GiB)': '75.88', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '73.53', 'tokens/total': 900989952, 'tokens/trainable': 333004704, 'epoch': '0.814'}
 27%|███████████████████████▊                                                                | 475/1751 [7:57:34<21:41:03, 61.18s/it] 27%|███████████████████████▉                                                                | 476/1751 [7:58:36<21:40:47, 61.21s/it]                                                                                                                                     {'loss': '0.5321', 'grad_norm': '0.209', 'learning_rate': '1.743e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '76.72', 'memory/max_allocated (GiB)': '76.72', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '70.69', 'tokens/total': 902984192, 'tokens/trainable': 333746720, 'epoch': '0.8157'}
 27%|███████████████████████▉                                                                | 476/1751 [7:58:36<21:40:47, 61.21s/it] 27%|███████████████████████▉                                                                | 477/1751 [7:59:33<21:13:48, 59.99s/it]                                                                                                                                     {'loss': '0.5329', 'grad_norm': '0.209', 'learning_rate': '1.742e-05', 'ppl': '1.704', 'memory/max_active (GiB)': '73.18', 'memory/max_allocated (GiB)': '73.18', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '76.16', 'tokens/total': 904787136, 'tokens/trainable': 334392960, 'epoch': '0.8174'}
 27%|███████████████████████▉                                                                | 477/1751 [7:59:33<21:13:48, 59.99s/it] 27%|████████████████████████                                                                | 478/1751 [8:00:36<21:34:17, 61.00s/it]                                                                                                                                     {'loss': '0.4818', 'grad_norm': '0.1904', 'learning_rate': '1.741e-05', 'ppl': '1.619', 'memory/max_active (GiB)': '68.72', 'memory/max_allocated (GiB)': '68.72', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '84.73', 'tokens/total': 906825152, 'tokens/trainable': 335137984, 'epoch': '0.8191'}
 27%|████████████████████████                                                                | 478/1751 [8:00:36<21:34:17, 61.00s/it] 27%|████████████████████████                                                                | 479/1751 [8:01:35<21:21:25, 60.44s/it]                                                                                                                                     {'loss': '0.52', 'grad_norm': '0.208', 'learning_rate': '1.74e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '75.46', 'memory/max_allocated (GiB)': '75.46', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '40.51', 'tokens/total': 908721536, 'tokens/trainable': 335799584, 'epoch': '0.8208'}
 27%|████████████████████████                                                                | 479/1751 [8:01:35<21:21:25, 60.44s/it] 27%|████████████████████████                                                                | 480/1751 [8:02:35<21:14:42, 60.17s/it]                                                                                                                                     {'loss': '0.5341', 'grad_norm': '0.2041', 'learning_rate': '1.738e-05', 'ppl': '1.706', 'memory/max_active (GiB)': '71.7', 'memory/max_allocated (GiB)': '71.7', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '103', 'tokens/total': 910638336, 'tokens/trainable': 336507392, 'epoch': '0.8225'}
 27%|████████████████████████                                                                | 480/1751 [8:02:35<21:14:42, 60.17s/it] 27%|████████████████████████▏                                                               | 481/1751 [8:03:32<20:55:09, 59.30s/it]                                                                                                                                     {'loss': '0.5171', 'grad_norm': '0.1904', 'learning_rate': '1.737e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '72.48', 'memory/max_allocated (GiB)': '72.48', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '124.9', 'tokens/total': 912507840, 'tokens/trainable': 337174304, 'epoch': '0.8242'}
 27%|████████████████████████▏                                                               | 481/1751 [8:03:32<20:55:09, 59.30s/it] 28%|████████████████████████▏                                                               | 482/1751 [8:04:32<20:56:27, 59.41s/it]                                                                                                                                     {'loss': '0.5074', 'grad_norm': '0.1885', 'learning_rate': '1.736e-05', 'ppl': '1.661', 'memory/max_active (GiB)': '73.71', 'memory/max_allocated (GiB)': '73.71', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '58.08', 'tokens/total': 914442496, 'tokens/trainable': 337866240, 'epoch': '0.826'}
 28%|████████████████████████▏                                                               | 482/1751 [8:04:32<20:56:27, 59.41s/it] 28%|████████████████████████▎                                                               | 483/1751 [8:05:32<21:03:15, 59.78s/it]                                                                                                                                     {'loss': '0.4688', 'grad_norm': '0.1797', 'learning_rate': '1.735e-05', 'ppl': '1.598', 'memory/max_active (GiB)': '76.02', 'memory/max_allocated (GiB)': '76.02', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '126.1', 'tokens/total': 916325312, 'tokens/trainable': 338588064, 'epoch': '0.8277'}
 28%|████████████████████████▎                                                               | 483/1751 [8:05:32<21:03:15, 59.78s/it] 28%|████████████████████████▎                                                               | 484/1751 [8:06:31<20:56:31, 59.50s/it]                                                                                                                                     {'loss': '0.5056', 'grad_norm': '0.1973', 'learning_rate': '1.733e-05', 'ppl': '1.658', 'memory/max_active (GiB)': '73.03', 'memory/max_allocated (GiB)': '73.03', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '59.52', 'tokens/total': 918136128, 'tokens/trainable': 339225696, 'epoch': '0.8294'}
 28%|████████████████████████▎                                                               | 484/1751 [8:06:31<20:56:31, 59.50s/it] 28%|████████████████████████▎                                                               | 485/1751 [8:07:33<21:09:43, 60.18s/it]                                                                                                                                     {'loss': '0.4923', 'grad_norm': '0.1865', 'learning_rate': '1.732e-05', 'ppl': '1.636', 'memory/max_active (GiB)': '75.4', 'memory/max_allocated (GiB)': '75.4', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '183.7', 'tokens/total': 920047296, 'tokens/trainable': 339949280, 'epoch': '0.8311'}
 28%|████████████████████████▎                                                               | 485/1751 [8:07:33<21:09:43, 60.18s/it] 28%|████████████████████████▍                                                               | 486/1751 [8:08:35<21:23:06, 60.86s/it]                                                                                                                                     {'loss': '0.5098', 'grad_norm': '0.1836', 'learning_rate': '1.731e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '75.92', 'memory/max_allocated (GiB)': '75.92', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '27.69', 'tokens/total': 922033280, 'tokens/trainable': 340718720, 'epoch': '0.8328'}
 28%|████████████████████████▍                                                               | 486/1751 [8:08:35<21:23:06, 60.86s/it] 28%|████████████████████████▍                                                               | 487/1751 [8:09:34<21:06:01, 60.10s/it]                                                                                                                                     {'loss': '0.5243', 'grad_norm': '0.1875', 'learning_rate': '1.729e-05', 'ppl': '1.689', 'memory/max_active (GiB)': '76.56', 'memory/max_allocated (GiB)': '76.56', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '231.4', 'tokens/total': 923866624, 'tokens/trainable': 341391424, 'epoch': '0.8345'}
 28%|████████████████████████▍                                                               | 487/1751 [8:09:34<21:06:01, 60.10s/it] 28%|████████████████████████▌                                                               | 488/1751 [8:10:35<21:12:16, 60.44s/it]                                                                                                                                     {'loss': '0.5193', 'grad_norm': '0.1953', 'learning_rate': '1.728e-05', 'ppl': '1.681', 'memory/max_active (GiB)': '75.88', 'memory/max_allocated (GiB)': '75.88', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '66.57', 'tokens/total': 925842176, 'tokens/trainable': 342100608, 'epoch': '0.8362'}
 28%|████████████████████████▌                                                               | 488/1751 [8:10:35<21:12:16, 60.44s/it] 28%|████████████████████████▌                                                               | 489/1751 [8:11:34<21:02:26, 60.02s/it]                                                                                                                                     {'loss': '0.5781', 'grad_norm': '0.2021', 'learning_rate': '1.727e-05', 'ppl': '1.783', 'memory/max_active (GiB)': '68.41', 'memory/max_allocated (GiB)': '68.41', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '90.43', 'tokens/total': 927725632, 'tokens/trainable': 342782080, 'epoch': '0.8379'}
 28%|████████████████████████▌                                                               | 489/1751 [8:11:34<21:02:26, 60.02s/it] 28%|████████████████████████▋                                                               | 490/1751 [8:12:33<20:56:12, 59.77s/it]                                                                                                                                     {'loss': '0.5706', 'grad_norm': '0.2236', 'learning_rate': '1.726e-05', 'ppl': '1.769', 'memory/max_active (GiB)': '65.27', 'memory/max_allocated (GiB)': '65.27', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '188.6', 'tokens/total': 929588416, 'tokens/trainable': 343494752, 'epoch': '0.8397'}
 28%|████████████████████████▋                                                               | 490/1751 [8:12:33<20:56:12, 59.77s/it] 28%|████████████████████████▋                                                               | 491/1751 [8:13:33<20:53:27, 59.69s/it]                                                                                                                                     {'loss': '0.5169', 'grad_norm': '0.2021', 'learning_rate': '1.724e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '69.34', 'memory/max_allocated (GiB)': '69.34', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '53.42', 'tokens/total': 931461568, 'tokens/trainable': 344179552, 'epoch': '0.8414'}
 28%|████████████████████████▋                                                               | 491/1751 [8:13:33<20:53:27, 59.69s/it] 28%|████████████████████████▋                                                               | 492/1751 [8:14:32<20:51:11, 59.63s/it]                                                                                                                                     {'loss': '0.548', 'grad_norm': '0.2051', 'learning_rate': '1.723e-05', 'ppl': '1.73', 'memory/max_active (GiB)': '75.63', 'memory/max_allocated (GiB)': '75.63', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '151.2', 'tokens/total': 933370944, 'tokens/trainable': 344859392, 'epoch': '0.8431'}
 28%|████████████████████████▋                                                               | 492/1751 [8:14:32<20:51:11, 59.63s/it] 28%|████████████████████████▊                                                               | 493/1751 [8:15:31<20:42:44, 59.27s/it]                                                                                                                                     {'loss': '0.5463', 'grad_norm': '0.2002', 'learning_rate': '1.722e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '74.11', 'memory/max_allocated (GiB)': '74.11', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '19.83', 'tokens/total': 935193280, 'tokens/trainable': 345561056, 'epoch': '0.8448'}
 28%|████████████████████████▊                                                               | 493/1751 [8:15:31<20:42:44, 59.27s/it] 28%|████████████████████████▊                                                               | 494/1751 [8:16:31<20:46:25, 59.50s/it]                                                                                                                                     {'loss': '0.5133', 'grad_norm': '0.1953', 'learning_rate': '1.72e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '75.1', 'memory/max_allocated (GiB)': '75.1', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '36.64', 'tokens/total': 937093184, 'tokens/trainable': 346280960, 'epoch': '0.8465'}
 28%|████████████████████████▊                                                               | 494/1751 [8:16:31<20:46:25, 59.50s/it] 28%|████████████████████████▉                                                               | 495/1751 [8:17:31<20:52:40, 59.84s/it]                                                                                                                                     {'loss': '0.5169', 'grad_norm': '0.1992', 'learning_rate': '1.719e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '72.96', 'memory/max_allocated (GiB)': '72.96', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '135.3', 'tokens/total': 939003328, 'tokens/trainable': 346982176, 'epoch': '0.8482'}
 28%|████████████████████████▉                                                               | 495/1751 [8:17:31<20:52:40, 59.84s/it] 28%|████████████████████████▉                                                               | 496/1751 [8:18:31<20:53:00, 59.90s/it]                                                                                                                                     {'loss': '0.5778', 'grad_norm': '0.21', 'learning_rate': '1.718e-05', 'ppl': '1.782', 'memory/max_active (GiB)': '67.38', 'memory/max_allocated (GiB)': '67.38', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '43.98', 'tokens/total': 940880768, 'tokens/trainable': 347674176, 'epoch': '0.8499'}
 28%|████████████████████████▉                                                               | 496/1751 [8:18:31<20:53:00, 59.90s/it] 28%|████████████████████████▉                                                               | 497/1751 [8:19:32<20:56:04, 60.10s/it]                                                                                                                                     {'loss': '0.4933', 'grad_norm': '0.1934', 'learning_rate': '1.716e-05', 'ppl': '1.638', 'memory/max_active (GiB)': '73.52', 'memory/max_allocated (GiB)': '73.52', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '20.16', 'tokens/total': 942804928, 'tokens/trainable': 348390848, 'epoch': '0.8517'}
 28%|████████████████████████▉                                                               | 497/1751 [8:19:32<20:56:04, 60.10s/it] 28%|█████████████████████████                                                               | 498/1751 [8:20:30<20:44:19, 59.58s/it]                                                                                                                                     {'loss': '0.5136', 'grad_norm': '0.2061', 'learning_rate': '1.715e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '74.51', 'memory/max_allocated (GiB)': '74.51', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '68.64', 'tokens/total': 944658624, 'tokens/trainable': 349066400, 'epoch': '0.8534'}
 28%|█████████████████████████                                                               | 498/1751 [8:20:30<20:44:19, 59.58s/it] 28%|█████████████████████████                                                               | 499/1751 [8:21:29<20:37:17, 59.30s/it]                                                                                                                                     {'loss': '0.5305', 'grad_norm': '0.2041', 'learning_rate': '1.714e-05', 'ppl': '1.7', 'memory/max_active (GiB)': '77.15', 'memory/max_allocated (GiB)': '77.15', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '91.15', 'tokens/total': 946586688, 'tokens/trainable': 349761472, 'epoch': '0.8551'}
 28%|█████████████████████████                                                               | 499/1751 [8:21:29<20:37:17, 59.30s/it] 29%|█████████████████████████▏                                                              | 500/1751 [8:22:29<20:38:51, 59.42s/it]                                                                                                                                     {'loss': '0.5162', 'grad_norm': '0.1914', 'learning_rate': '1.712e-05', 'ppl': '1.676', 'memory/max_active (GiB)': '76.5', 'memory/max_allocated (GiB)': '76.5', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '74.82', 'tokens/total': 948601664, 'tokens/trainable': 350507168, 'epoch': '0.8568'}
 29%|█████████████████████████▏                                                              | 500/1751 [8:22:29<20:38:51, 59.42s/it] 29%|█████████████████████████▏                                                              | 501/1751 [8:23:25<20:19:08, 58.52s/it]                                                                                                                                     {'loss': '0.5415', 'grad_norm': '0.2158', 'learning_rate': '1.711e-05', 'ppl': '1.719', 'memory/max_active (GiB)': '70.76', 'memory/max_allocated (GiB)': '70.76', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '61.84', 'tokens/total': 950496192, 'tokens/trainable': 351187744, 'epoch': '0.8585'}
 29%|█████████████████████████▏                                                              | 501/1751 [8:23:25<20:19:08, 58.52s/it] 29%|█████████████████████████▏                                                              | 502/1751 [8:24:26<20:33:39, 59.26s/it]                                                                                                                                     {'loss': '0.4987', 'grad_norm': '0.1846', 'learning_rate': '1.71e-05', 'ppl': '1.647', 'memory/max_active (GiB)': '74.56', 'memory/max_allocated (GiB)': '74.56', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '15.04', 'tokens/total': 952508480, 'tokens/trainable': 351920736, 'epoch': '0.8602'}
 29%|█████████████████████████▏                                                              | 502/1751 [8:24:26<20:33:39, 59.26s/it] 29%|█████████████████████████▎                                                              | 503/1751 [8:25:26<20:35:29, 59.40s/it]                                                                                                                                     {'loss': '0.5356', 'grad_norm': '0.1953', 'learning_rate': '1.708e-05', 'ppl': '1.709', 'memory/max_active (GiB)': '70.27', 'memory/max_allocated (GiB)': '70.27', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '100.7', 'tokens/total': 954365440, 'tokens/trainable': 352606560, 'epoch': '0.8619'}
 29%|█████████████████████████▎                                                              | 503/1751 [8:25:26<20:35:29, 59.40s/it] 29%|█████████████████████████▎                                                              | 504/1751 [8:26:27<20:44:36, 59.89s/it]                                                                                                                                     {'loss': '0.522', 'grad_norm': '0.1885', 'learning_rate': '1.707e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '75.78', 'memory/max_allocated (GiB)': '75.78', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '54.61', 'tokens/total': 956279104, 'tokens/trainable': 353318560, 'epoch': '0.8636'}
 29%|█████████████████████████▎                                                              | 504/1751 [8:26:27<20:44:36, 59.89s/it] 29%|█████████████████████████▍                                                              | 505/1751 [8:27:29<20:57:38, 60.56s/it]                                                                                                                                     {'loss': '0.5146', 'grad_norm': '0.1895', 'learning_rate': '1.706e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '77.27', 'memory/max_allocated (GiB)': '77.27', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '61.22', 'tokens/total': 958258240, 'tokens/trainable': 354026944, 'epoch': '0.8654'}
 29%|█████████████████████████▍                                                              | 505/1751 [8:27:29<20:57:38, 60.56s/it] 29%|█████████████████████████▍                                                              | 506/1751 [8:28:30<20:56:36, 60.56s/it]                                                                                                                                     {'loss': '0.547', 'grad_norm': '0.1992', 'learning_rate': '1.704e-05', 'ppl': '1.728', 'memory/max_active (GiB)': '75.8', 'memory/max_allocated (GiB)': '75.8', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '81.59', 'tokens/total': 960193024, 'tokens/trainable': 354767168, 'epoch': '0.8671'}
 29%|█████████████████████████▍                                                              | 506/1751 [8:28:30<20:56:36, 60.56s/it] 29%|█████████████████████████▍                                                              | 507/1751 [8:29:30<20:55:07, 60.54s/it]                                                                                                                                     {'loss': '0.536', 'grad_norm': '0.1973', 'learning_rate': '1.703e-05', 'ppl': '1.709', 'memory/max_active (GiB)': '69.64', 'memory/max_allocated (GiB)': '69.64', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '77.47', 'tokens/total': 962129344, 'tokens/trainable': 355494880, 'epoch': '0.8688'}
 29%|█████████████████████████▍                                                              | 507/1751 [8:29:30<20:55:07, 60.54s/it] 29%|█████████████████████████▌                                                              | 508/1751 [8:30:30<20:52:06, 60.44s/it]                                                                                                                                     {'loss': '0.5346', 'grad_norm': '0.2061', 'learning_rate': '1.702e-05', 'ppl': '1.707', 'memory/max_active (GiB)': '73.31', 'memory/max_allocated (GiB)': '73.31', 'memory/device_reserved (GiB)': '106.5', 'tokens/train_per_sec_per_gpu': '23.5', 'tokens/total': 964021120, 'tokens/trainable': 356213152, 'epoch': '0.8705'}
 29%|█████████████████████████▌                                                              | 508/1751 [8:30:30<20:52:06, 60.44s/it] 29%|█████████████████████████▌                                                              | 509/1751 [8:31:29<20:42:01, 60.00s/it]                                                                                                                                     {'loss': '0.5604', 'grad_norm': '0.1982', 'learning_rate': '1.7e-05', 'ppl': '1.751', 'memory/max_active (GiB)': '74.8', 'memory/max_allocated (GiB)': '74.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '123.8', 'tokens/total': 965851584, 'tokens/trainable': 356899232, 'epoch': '0.8722'}
 29%|█████████████████████████▌                                                              | 509/1751 [8:31:29<20:42:01, 60.00s/it] 29%|█████████████████████████▋                                                              | 510/1751 [8:32:29<20:39:56, 59.95s/it]                                                                                                                                     {'loss': '0.5876', 'grad_norm': '0.208', 'learning_rate': '1.699e-05', 'ppl': '1.8', 'memory/max_active (GiB)': '71.01', 'memory/max_allocated (GiB)': '71.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '135', 'tokens/total': 967736320, 'tokens/trainable': 357626144, 'epoch': '0.8739'}
 29%|█████████████████████████▋                                                              | 510/1751 [8:32:29<20:39:56, 59.95s/it] 29%|█████████████████████████▋                                                              | 511/1751 [8:33:29<20:39:03, 59.95s/it]                                                                                                                                     {'loss': '0.5366', 'grad_norm': '0.2002', 'learning_rate': '1.698e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '75.06', 'memory/max_allocated (GiB)': '75.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '58.01', 'tokens/total': 969660224, 'tokens/trainable': 358351040, 'epoch': '0.8756'}
 29%|█████████████████████████▋                                                              | 511/1751 [8:33:29<20:39:03, 59.95s/it] 29%|█████████████████████████▋                                                              | 512/1751 [8:34:29<20:40:36, 60.08s/it]                                                                                                                                     {'loss': '0.53', 'grad_norm': '0.2021', 'learning_rate': '1.696e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '70.44', 'memory/max_allocated (GiB)': '70.44', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '179.6', 'tokens/total': 971585472, 'tokens/trainable': 359044352, 'epoch': '0.8774'}
 29%|█████████████████████████▋                                                              | 512/1751 [8:34:29<20:40:36, 60.08s/it] 29%|█████████████████████████▊                                                              | 513/1751 [8:35:30<20:45:34, 60.37s/it]                                                                                                                                     {'loss': '0.5385', 'grad_norm': '0.2002', 'learning_rate': '1.695e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '76.53', 'memory/max_allocated (GiB)': '76.53', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.08', 'tokens/total': 973486592, 'tokens/trainable': 359779264, 'epoch': '0.8791'}
 29%|█████████████████████████▊                                                              | 513/1751 [8:35:30<20:45:34, 60.37s/it] 29%|█████████████████████████▊                                                              | 514/1751 [8:36:30<20:38:46, 60.09s/it]                                                                                                                                     {'loss': '0.5287', 'grad_norm': '0.1982', 'learning_rate': '1.694e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '76.15', 'memory/max_allocated (GiB)': '76.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '147.2', 'tokens/total': 975352384, 'tokens/trainable': 360467744, 'epoch': '0.8808'}
 29%|█████████████████████████▊                                                              | 514/1751 [8:36:30<20:38:46, 60.09s/it] 29%|█████████████████████████▉                                                              | 515/1751 [8:37:31<20:42:05, 60.30s/it]                                                                                                                                     {'loss': '0.5319', 'grad_norm': '0.2129', 'learning_rate': '1.692e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '67.11', 'memory/max_allocated (GiB)': '67.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '68.16', 'tokens/total': 977279104, 'tokens/trainable': 361183232, 'epoch': '0.8825'}
 29%|█████████████████████████▉                                                              | 515/1751 [8:37:31<20:42:05, 60.30s/it] 29%|█████████████████████████▉                                                              | 516/1751 [8:38:29<20:29:03, 59.71s/it]                                                                                                                                     {'loss': '0.5516', 'grad_norm': '0.2002', 'learning_rate': '1.691e-05', 'ppl': '1.736', 'memory/max_active (GiB)': '78.38', 'memory/max_allocated (GiB)': '78.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '48.44', 'tokens/total': 979076032, 'tokens/trainable': 361848192, 'epoch': '0.8842'}
 29%|█████████████████████████▉                                                              | 516/1751 [8:38:29<20:29:03, 59.71s/it] 30%|█████████████████████████▉                                                              | 517/1751 [8:39:30<20:39:23, 60.26s/it]                                                                                                                                     {'loss': '0.5426', 'grad_norm': '0.1934', 'learning_rate': '1.69e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '74.55', 'memory/max_allocated (GiB)': '74.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107.2', 'tokens/total': 981050752, 'tokens/trainable': 362614560, 'epoch': '0.8859'}
 30%|█████████████████████████▉                                                              | 517/1751 [8:39:31<20:39:23, 60.26s/it] 30%|██████████████████████████                                                              | 518/1751 [8:40:30<20:34:25, 60.07s/it]                                                                                                                                     {'loss': '0.4881', 'grad_norm': '0.1807', 'learning_rate': '1.688e-05', 'ppl': '1.629', 'memory/max_active (GiB)': '75.95', 'memory/max_allocated (GiB)': '75.95', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '180.9', 'tokens/total': 982978240, 'tokens/trainable': 363331744, 'epoch': '0.8876'}
 30%|██████████████████████████                                                              | 518/1751 [8:40:30<20:34:25, 60.07s/it] 30%|██████████████████████████                                                              | 519/1751 [8:41:32<20:44:31, 60.61s/it]                                                                                                                                     {'loss': '0.5198', 'grad_norm': '0.1982', 'learning_rate': '1.687e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '73.51', 'memory/max_allocated (GiB)': '73.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '260.9', 'tokens/total': 984946176, 'tokens/trainable': 364067168, 'epoch': '0.8894'}
 30%|██████████████████████████                                                              | 519/1751 [8:41:32<20:44:31, 60.61s/it] 30%|██████████████████████████▏                                                             | 520/1751 [8:42:32<20:40:23, 60.46s/it]                                                                                                                                     {'loss': '0.5254', 'grad_norm': '0.1953', 'learning_rate': '1.685e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '77.53', 'memory/max_allocated (GiB)': '77.53', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '54.51', 'tokens/total': 986874944, 'tokens/trainable': 364804192, 'epoch': '0.8911'}
 30%|██████████████████████████▏                                                             | 520/1751 [8:42:32<20:40:23, 60.46s/it] 30%|██████████████████████████▏                                                             | 521/1751 [8:43:32<20:33:13, 60.16s/it]                                                                                                                                     {'loss': '0.5452', 'grad_norm': '0.2148', 'learning_rate': '1.684e-05', 'ppl': '1.725', 'memory/max_active (GiB)': '69.08', 'memory/max_allocated (GiB)': '69.08', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.4', 'tokens/total': 988731264, 'tokens/trainable': 365473888, 'epoch': '0.8928'}
 30%|██████████████████████████▏                                                             | 521/1751 [8:43:32<20:33:13, 60.16s/it] 30%|██████████████████████████▏                                                             | 522/1751 [8:44:33<20:37:38, 60.42s/it]                                                                                                                                     {'loss': '0.556', 'grad_norm': '0.1973', 'learning_rate': '1.683e-05', 'ppl': '1.744', 'memory/max_active (GiB)': '72.97', 'memory/max_allocated (GiB)': '72.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '28.91', 'tokens/total': 990669248, 'tokens/trainable': 366182624, 'epoch': '0.8945'}
 30%|██████████████████████████▏                                                             | 522/1751 [8:44:33<20:37:38, 60.42s/it] 30%|██████████████████████████▎                                                             | 523/1751 [8:45:33<20:36:32, 60.42s/it]                                                                                                                                     {'loss': '0.5275', 'grad_norm': '0.1973', 'learning_rate': '1.681e-05', 'ppl': '1.695', 'memory/max_active (GiB)': '71.37', 'memory/max_allocated (GiB)': '71.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.05', 'tokens/total': 992599296, 'tokens/trainable': 366884960, 'epoch': '0.8962'}
 30%|██████████████████████████▎                                                             | 523/1751 [8:45:33<20:36:32, 60.42s/it] 30%|██████████████████████████▎                                                             | 524/1751 [8:46:32<20:29:22, 60.12s/it]                                                                                                                                     {'loss': '0.536', 'grad_norm': '0.1914', 'learning_rate': '1.68e-05', 'ppl': '1.709', 'memory/max_active (GiB)': '76.1', 'memory/max_allocated (GiB)': '76.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.5', 'tokens/total': 994499584, 'tokens/trainable': 367580512, 'epoch': '0.8979'}
 30%|██████████████████████████▎                                                             | 524/1751 [8:46:32<20:29:22, 60.12s/it] 30%|██████████████████████████▍                                                             | 525/1751 [8:47:36<20:48:35, 61.11s/it]                                                                                                                                     {'loss': '0.5078', 'grad_norm': '0.1982', 'learning_rate': '1.679e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '73.86', 'memory/max_allocated (GiB)': '73.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '28.22', 'tokens/total': 996548992, 'tokens/trainable': 368324480, 'epoch': '0.8996'}
 30%|██████████████████████████▍                                                             | 525/1751 [8:47:36<20:48:35, 61.11s/it] 30%|██████████████████████████▍                                                             | 526/1751 [8:48:35<20:37:51, 60.63s/it]                                                                                                                                     {'loss': '0.5435', 'grad_norm': '0.1924', 'learning_rate': '1.677e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '67.06', 'memory/max_allocated (GiB)': '67.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.5', 'tokens/total': 998449472, 'tokens/trainable': 369011200, 'epoch': '0.9013'}
 30%|██████████████████████████▍                                                             | 526/1751 [8:48:35<20:37:51, 60.63s/it] 30%|██████████████████████████▍                                                             | 527/1751 [8:49:37<20:44:26, 61.00s/it]                                                                                                                                     {'loss': '0.5232', 'grad_norm': '0.1904', 'learning_rate': '1.676e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '74.91', 'memory/max_allocated (GiB)': '74.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '39.89', 'tokens/total': 1000448320, 'tokens/trainable': 369758752, 'epoch': '0.9031'}
 30%|██████████████████████████▍                                                             | 527/1751 [8:49:37<20:44:26, 61.00s/it] 30%|██████████████████████████▌                                                             | 528/1751 [8:50:38<20:41:39, 60.92s/it]                                                                                                                                     {'loss': '0.5045', 'grad_norm': '0.1865', 'learning_rate': '1.674e-05', 'ppl': '1.656', 'memory/max_active (GiB)': '76.28', 'memory/max_allocated (GiB)': '76.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '28.34', 'tokens/total': 1002367488, 'tokens/trainable': 370457408, 'epoch': '0.9048'}
 30%|██████████████████████████▌                                                             | 528/1751 [8:50:38<20:41:39, 60.92s/it] 30%|██████████████████████████▌                                                             | 529/1751 [8:51:37<20:29:48, 60.38s/it]                                                                                                                                     {'loss': '0.5486', 'grad_norm': '0.2002', 'learning_rate': '1.673e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '69.72', 'memory/max_allocated (GiB)': '69.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '92.97', 'tokens/total': 1004237120, 'tokens/trainable': 371150464, 'epoch': '0.9065'}
 30%|██████████████████████████▌                                                             | 529/1751 [8:51:37<20:29:48, 60.38s/it] 30%|██████████████████████████▋                                                             | 530/1751 [8:52:40<20:44:42, 61.17s/it]                                                                                                                                     {'loss': '0.4995', 'grad_norm': '0.1787', 'learning_rate': '1.672e-05', 'ppl': '1.648', 'memory/max_active (GiB)': '72.66', 'memory/max_allocated (GiB)': '72.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.57', 'tokens/total': 1006276096, 'tokens/trainable': 371866784, 'epoch': '0.9082'}
 30%|██████████████████████████▋                                                             | 530/1751 [8:52:40<20:44:42, 61.17s/it] 30%|██████████████████████████▋                                                             | 531/1751 [8:53:42<20:45:55, 61.27s/it]                                                                                                                                     {'loss': '0.5005', 'grad_norm': '0.1885', 'learning_rate': '1.67e-05', 'ppl': '1.649', 'memory/max_active (GiB)': '75.64', 'memory/max_allocated (GiB)': '75.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '78.26', 'tokens/total': 1008275008, 'tokens/trainable': 372607616, 'epoch': '0.9099'}
 30%|██████████████████████████▋                                                             | 531/1751 [8:53:42<20:45:55, 61.27s/it] 30%|██████████████████████████▋                                                             | 532/1751 [8:54:41<20:33:35, 60.72s/it]                                                                                                                                     {'loss': '0.5069', 'grad_norm': '0.1924', 'learning_rate': '1.669e-05', 'ppl': '1.66', 'memory/max_active (GiB)': '75.12', 'memory/max_allocated (GiB)': '75.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '54.39', 'tokens/total': 1010150336, 'tokens/trainable': 373309728, 'epoch': '0.9116'}
 30%|██████████████████████████▋                                                             | 532/1751 [8:54:41<20:33:35, 60.72s/it] 30%|██████████████████████████▊                                                             | 533/1751 [8:55:43<20:39:01, 61.04s/it]                                                                                                                                     {'loss': '0.5266', 'grad_norm': '0.1816', 'learning_rate': '1.667e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '76.71', 'memory/max_allocated (GiB)': '76.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '33.53', 'tokens/total': 1012156480, 'tokens/trainable': 374043616, 'epoch': '0.9133'}
 30%|██████████████████████████▊                                                             | 533/1751 [8:55:43<20:39:01, 61.04s/it] 30%|██████████████████████████▊                                                             | 534/1751 [8:56:41<20:17:48, 60.04s/it]                                                                                                                                     {'loss': '0.5319', 'grad_norm': '0.2031', 'learning_rate': '1.666e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '76', 'memory/max_allocated (GiB)': '76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '25.04', 'tokens/total': 1013945088, 'tokens/trainable': 374711360, 'epoch': '0.9151'}
 30%|██████████████████████████▊                                                             | 534/1751 [8:56:41<20:17:48, 60.04s/it] 31%|██████████████████████████▉                                                             | 535/1751 [8:57:41<20:20:38, 60.23s/it]                                                                                                                                     {'loss': '0.5334', 'grad_norm': '0.1934', 'learning_rate': '1.665e-05', 'ppl': '1.705', 'memory/max_active (GiB)': '74.55', 'memory/max_allocated (GiB)': '74.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.38', 'tokens/total': 1015893184, 'tokens/trainable': 375423264, 'epoch': '0.9168'}
 31%|██████████████████████████▉                                                             | 535/1751 [8:57:41<20:20:38, 60.23s/it] 31%|██████████████████████████▉                                                             | 536/1751 [8:58:42<20:21:20, 60.31s/it]                                                                                                                                     {'loss': '0.5067', 'grad_norm': '0.1855', 'learning_rate': '1.663e-05', 'ppl': '1.66', 'memory/max_active (GiB)': '74.66', 'memory/max_allocated (GiB)': '74.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '166', 'tokens/total': 1017807808, 'tokens/trainable': 376141760, 'epoch': '0.9185'}
 31%|██████████████████████████▉                                                             | 536/1751 [8:58:42<20:21:20, 60.31s/it] 31%|██████████████████████████▉                                                             | 537/1751 [8:59:43<20:27:43, 60.68s/it]                                                                                                                                     {'loss': '0.5362', 'grad_norm': '0.1924', 'learning_rate': '1.662e-05', 'ppl': '1.709', 'memory/max_active (GiB)': '76.54', 'memory/max_allocated (GiB)': '76.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '147.8', 'tokens/total': 1019714624, 'tokens/trainable': 376847648, 'epoch': '0.9202'}
 31%|██████████████████████████▉                                                             | 537/1751 [8:59:43<20:27:43, 60.68s/it] 31%|███████████████████████████                                                             | 538/1751 [9:00:44<20:28:31, 60.77s/it]                                                                                                                                     {'loss': '0.491', 'grad_norm': '0.1943', 'learning_rate': '1.66e-05', 'ppl': '1.634', 'memory/max_active (GiB)': '74.84', 'memory/max_allocated (GiB)': '74.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '175.1', 'tokens/total': 1021669120, 'tokens/trainable': 377554496, 'epoch': '0.9219'}
 31%|███████████████████████████                                                             | 538/1751 [9:00:44<20:28:31, 60.77s/it] 31%|███████████████████████████                                                             | 539/1751 [9:01:43<20:13:33, 60.08s/it]                                                                                                                                     {'loss': '0.5461', 'grad_norm': '0.2168', 'learning_rate': '1.659e-05', 'ppl': '1.726', 'memory/max_active (GiB)': '72.31', 'memory/max_allocated (GiB)': '72.31', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '100.6', 'tokens/total': 1023523072, 'tokens/trainable': 378239200, 'epoch': '0.9236'}
 31%|███████████████████████████                                                             | 539/1751 [9:01:43<20:13:33, 60.08s/it] 31%|███████████████████████████▏                                                            | 540/1751 [9:02:42<20:11:09, 60.01s/it]                                                                                                                                     {'loss': '0.5599', 'grad_norm': '0.2227', 'learning_rate': '1.657e-05', 'ppl': '1.751', 'memory/max_active (GiB)': '74.89', 'memory/max_allocated (GiB)': '74.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.1', 'tokens/total': 1025399616, 'tokens/trainable': 378935808, 'epoch': '0.9253'}
 31%|███████████████████████████▏                                                            | 540/1751 [9:02:43<20:11:09, 60.01s/it] 31%|███████████████████████████▏                                                            | 541/1751 [9:03:43<20:12:40, 60.13s/it]                                                                                                                                     {'loss': '0.5022', 'grad_norm': '0.1982', 'learning_rate': '1.656e-05', 'ppl': '1.652', 'memory/max_active (GiB)': '73.4', 'memory/max_allocated (GiB)': '73.4', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.28', 'tokens/total': 1027301568, 'tokens/trainable': 379653472, 'epoch': '0.9271'}
 31%|███████████████████████████▏                                                            | 541/1751 [9:03:43<20:12:40, 60.13s/it] 31%|███████████████████████████▏                                                            | 542/1751 [9:04:43<20:12:18, 60.16s/it]                                                                                                                                     {'loss': '0.5385', 'grad_norm': '0.1973', 'learning_rate': '1.655e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '76.32', 'memory/max_allocated (GiB)': '76.32', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '39.19', 'tokens/total': 1029241920, 'tokens/trainable': 380365376, 'epoch': '0.9288'}
 31%|███████████████████████████▏                                                            | 542/1751 [9:04:43<20:12:18, 60.16s/it] 31%|███████████████████████████▎                                                            | 543/1751 [9:05:42<20:02:28, 59.73s/it]                                                                                                                                     {'loss': '0.5259', 'grad_norm': '0.1982', 'learning_rate': '1.653e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '73.17', 'memory/max_allocated (GiB)': '73.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.22', 'tokens/total': 1031078656, 'tokens/trainable': 381031328, 'epoch': '0.9305'}
 31%|███████████████████████████▎                                                            | 543/1751 [9:05:42<20:02:28, 59.73s/it] 31%|███████████████████████████▎                                                            | 544/1751 [9:06:43<20:07:47, 60.04s/it]                                                                                                                                     {'loss': '0.4848', 'grad_norm': '0.2168', 'learning_rate': '1.652e-05', 'ppl': '1.624', 'memory/max_active (GiB)': '73.11', 'memory/max_allocated (GiB)': '73.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.34', 'tokens/total': 1033030336, 'tokens/trainable': 381727872, 'epoch': '0.9322'}
 31%|███████████████████████████▎                                                            | 544/1751 [9:06:43<20:07:47, 60.04s/it] 31%|███████████████████████████▍                                                            | 545/1751 [9:07:42<20:03:59, 59.90s/it]                                                                                                                                     {'loss': '0.5439', 'grad_norm': '0.1953', 'learning_rate': '1.65e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '73.49', 'memory/max_allocated (GiB)': '73.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '55.2', 'tokens/total': 1034930560, 'tokens/trainable': 382428640, 'epoch': '0.9339'}
 31%|███████████████████████████▍                                                            | 545/1751 [9:07:42<20:03:59, 59.90s/it] 31%|███████████████████████████▍                                                            | 546/1751 [9:08:44<20:11:59, 60.35s/it]                                                                                                                                     {'loss': '0.5404', 'grad_norm': '0.2002', 'learning_rate': '1.649e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '76.59', 'memory/max_allocated (GiB)': '76.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.64', 'tokens/total': 1036905216, 'tokens/trainable': 383170528, 'epoch': '0.9356'}
 31%|███████████████████████████▍                                                            | 546/1751 [9:08:44<20:11:59, 60.35s/it] 31%|███████████████████████████▍                                                            | 547/1751 [9:09:44<20:12:01, 60.40s/it]                                                                                                                                     {'loss': '0.5174', 'grad_norm': '0.2002', 'learning_rate': '1.647e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '73.6', 'memory/max_allocated (GiB)': '73.6', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '61.2', 'tokens/total': 1038830336, 'tokens/trainable': 383905760, 'epoch': '0.9373'}
 31%|███████████████████████████▍                                                            | 547/1751 [9:09:44<20:12:01, 60.40s/it] 31%|███████████████████████████▌                                                            | 548/1751 [9:10:45<20:14:01, 60.55s/it]                                                                                                                                     {'loss': '0.5031', 'grad_norm': '0.1963', 'learning_rate': '1.646e-05', 'ppl': '1.654', 'memory/max_active (GiB)': '75.73', 'memory/max_allocated (GiB)': '75.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '50.58', 'tokens/total': 1040752128, 'tokens/trainable': 384619936, 'epoch': '0.939'}
 31%|███████████████████████████▌                                                            | 548/1751 [9:10:45<20:14:01, 60.55s/it] 31%|███████████████████████████▌                                                            | 549/1751 [9:11:45<20:12:20, 60.52s/it]                                                                                                                                     {'loss': '0.5394', 'grad_norm': '0.1914', 'learning_rate': '1.645e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '73.46', 'memory/max_allocated (GiB)': '73.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '48.19', 'tokens/total': 1042663232, 'tokens/trainable': 385328672, 'epoch': '0.9408'}
 31%|███████████████████████████▌                                                            | 549/1751 [9:11:45<20:12:20, 60.52s/it] 31%|███████████████████████████▋                                                            | 550/1751 [9:12:48<20:21:32, 61.03s/it]                                                                                                                                     {'loss': '0.5367', 'grad_norm': '0.1943', 'learning_rate': '1.643e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '72.82', 'memory/max_allocated (GiB)': '72.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '168.5', 'tokens/total': 1044624960, 'tokens/trainable': 386046240, 'epoch': '0.9425'}
 31%|███████████████████████████▋                                                            | 550/1751 [9:12:48<20:21:32, 61.03s/it] 31%|███████████████████████████▋                                                            | 551/1751 [9:13:49<20:22:19, 61.12s/it]                                                                                                                                     {'loss': '0.5217', 'grad_norm': '0.1895', 'learning_rate': '1.642e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '77.49', 'memory/max_allocated (GiB)': '77.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '196.3', 'tokens/total': 1046610240, 'tokens/trainable': 386761984, 'epoch': '0.9442'}
 31%|███████████████████████████▋                                                            | 551/1751 [9:13:49<20:22:19, 61.12s/it] 32%|███████████████████████████▋                                                            | 552/1751 [9:14:50<20:17:50, 60.94s/it]                                                                                                                                     {'loss': '0.5174', 'grad_norm': '0.1953', 'learning_rate': '1.64e-05', 'ppl': '1.678', 'memory/max_active (GiB)': '74.88', 'memory/max_allocated (GiB)': '74.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '102.8', 'tokens/total': 1048498752, 'tokens/trainable': 387466784, 'epoch': '0.9459'}
 32%|███████████████████████████▋                                                            | 552/1751 [9:14:50<20:17:50, 60.94s/it] 32%|███████████████████████████▊                                                            | 553/1751 [9:15:52<20:27:07, 61.46s/it]                                                                                                                                     {'loss': '0.5047', 'grad_norm': '0.2002', 'learning_rate': '1.639e-05', 'ppl': '1.657', 'memory/max_active (GiB)': '75.37', 'memory/max_allocated (GiB)': '75.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.29', 'tokens/total': 1050482048, 'tokens/trainable': 388216608, 'epoch': '0.9476'}
 32%|███████████████████████████▊                                                            | 553/1751 [9:15:52<20:27:07, 61.46s/it] 32%|███████████████████████████▊                                                            | 554/1751 [9:16:52<20:17:46, 61.04s/it]                                                                                                                                     {'loss': '0.5666', 'grad_norm': '0.2012', 'learning_rate': '1.637e-05', 'ppl': '1.762', 'memory/max_active (GiB)': '75.14', 'memory/max_allocated (GiB)': '75.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '14.78', 'tokens/total': 1052384320, 'tokens/trainable': 388918752, 'epoch': '0.9493'}
 32%|███████████████████████████▊                                                            | 554/1751 [9:16:52<20:17:46, 61.04s/it] 32%|███████████████████████████▉                                                            | 555/1751 [9:17:50<19:55:25, 59.97s/it]                                                                                                                                     {'loss': '0.5743', 'grad_norm': '0.2178', 'learning_rate': '1.636e-05', 'ppl': '1.776', 'memory/max_active (GiB)': '70.95', 'memory/max_allocated (GiB)': '70.95', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.81', 'tokens/total': 1054166208, 'tokens/trainable': 389575456, 'epoch': '0.951'}
 32%|███████████████████████████▉                                                            | 555/1751 [9:17:50<19:55:25, 59.97s/it] 32%|███████████████████████████▉                                                            | 556/1751 [9:18:52<20:05:40, 60.54s/it]                                                                                                                                     {'loss': '0.4923', 'grad_norm': '0.208', 'learning_rate': '1.634e-05', 'ppl': '1.636', 'memory/max_active (GiB)': '74.73', 'memory/max_allocated (GiB)': '74.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '112.8', 'tokens/total': 1056135488, 'tokens/trainable': 390313088, 'epoch': '0.9528'}
 32%|███████████████████████████▉                                                            | 556/1751 [9:18:52<20:05:40, 60.54s/it] 32%|███████████████████████████▉                                                            | 557/1751 [9:19:53<20:07:42, 60.69s/it]                                                                                                                                     {'loss': '0.5011', 'grad_norm': '0.1895', 'learning_rate': '1.633e-05', 'ppl': '1.651', 'memory/max_active (GiB)': '69.96', 'memory/max_allocated (GiB)': '69.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.26', 'tokens/total': 1058094400, 'tokens/trainable': 390984672, 'epoch': '0.9545'}
 32%|███████████████████████████▉                                                            | 557/1751 [9:19:53<20:07:42, 60.69s/it] 32%|████████████████████████████                                                            | 558/1751 [9:20:53<20:03:36, 60.53s/it]                                                                                                                                     {'loss': '0.4948', 'grad_norm': '0.209', 'learning_rate': '1.631e-05', 'ppl': '1.64', 'memory/max_active (GiB)': '74.63', 'memory/max_allocated (GiB)': '74.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '188.4', 'tokens/total': 1059995904, 'tokens/trainable': 391684672, 'epoch': '0.9562'}
 32%|████████████████████████████                                                            | 558/1751 [9:20:53<20:03:36, 60.53s/it] 32%|████████████████████████████                                                            | 559/1751 [9:21:50<19:42:19, 59.51s/it]                                                                                                                                     {'loss': '0.5366', 'grad_norm': '0.2129', 'learning_rate': '1.63e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '75.87', 'memory/max_allocated (GiB)': '75.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '58.05', 'tokens/total': 1061768448, 'tokens/trainable': 392334400, 'epoch': '0.9579'}
 32%|████████████████████████████                                                            | 559/1751 [9:21:50<19:42:19, 59.51s/it] 32%|████████████████████████████▏                                                           | 560/1751 [9:22:51<19:50:22, 59.97s/it]                                                                                                                                     {'loss': '0.5057', 'grad_norm': '0.1973', 'learning_rate': '1.629e-05', 'ppl': '1.658', 'memory/max_active (GiB)': '74.48', 'memory/max_allocated (GiB)': '74.48', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '121.4', 'tokens/total': 1063704768, 'tokens/trainable': 393046784, 'epoch': '0.9596'}
 32%|████████████████████████████▏                                                           | 560/1751 [9:22:51<19:50:22, 59.97s/it] 32%|████████████████████████████▏                                                           | 561/1751 [9:23:50<19:46:17, 59.81s/it]                                                                                                                                     {'loss': '0.5563', 'grad_norm': '0.21', 'learning_rate': '1.627e-05', 'ppl': '1.744', 'memory/max_active (GiB)': '76.23', 'memory/max_allocated (GiB)': '76.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.92', 'tokens/total': 1065641920, 'tokens/trainable': 393760064, 'epoch': '0.9613'}
 32%|████████████████████████████▏                                                           | 561/1751 [9:23:50<19:46:17, 59.81s/it] 32%|████████████████████████████▏                                                           | 562/1751 [9:24:50<19:41:04, 59.60s/it]                                                                                                                                     {'loss': '0.5488', 'grad_norm': '0.21', 'learning_rate': '1.626e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '65.14', 'memory/max_allocated (GiB)': '65.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.84', 'tokens/total': 1067547392, 'tokens/trainable': 394472960, 'epoch': '0.963'}
 32%|████████████████████████████▏                                                           | 562/1751 [9:24:50<19:41:04, 59.60s/it] 32%|████████████████████████████▎                                                           | 563/1751 [9:25:49<19:38:53, 59.54s/it]                                                                                                                                     {'loss': '0.5376', 'grad_norm': '0.1982', 'learning_rate': '1.624e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '71.55', 'memory/max_allocated (GiB)': '71.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '25.59', 'tokens/total': 1069431680, 'tokens/trainable': 395161152, 'epoch': '0.9648'}
 32%|████████████████████████████▎                                                           | 563/1751 [9:25:49<19:38:53, 59.54s/it] 32%|████████████████████████████▎                                                           | 564/1751 [9:26:47<19:32:07, 59.25s/it]                                                                                                                                     {'loss': '0.5868', 'grad_norm': '0.2217', 'learning_rate': '1.623e-05', 'ppl': '1.798', 'memory/max_active (GiB)': '69.85', 'memory/max_allocated (GiB)': '69.85', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '94.6', 'tokens/total': 1071231424, 'tokens/trainable': 395841504, 'epoch': '0.9665'}
 32%|████████████████████████████▎                                                           | 564/1751 [9:26:48<19:32:07, 59.25s/it] 32%|████████████████████████████▍                                                           | 565/1751 [9:27:48<19:37:37, 59.58s/it]                                                                                                                                     {'loss': '0.5426', 'grad_norm': '0.207', 'learning_rate': '1.621e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '73.07', 'memory/max_allocated (GiB)': '73.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '125.9', 'tokens/total': 1073167808, 'tokens/trainable': 396544736, 'epoch': '0.9682'}
 32%|████████████████████████████▍                                                           | 565/1751 [9:27:48<19:37:37, 59.58s/it] 32%|████████████████████████████▍                                                           | 566/1751 [9:28:48<19:38:27, 59.67s/it]                                                                                                                                     {'loss': '0.5373', 'grad_norm': '0.1875', 'learning_rate': '1.62e-05', 'ppl': '1.711', 'memory/max_active (GiB)': '72.15', 'memory/max_allocated (GiB)': '72.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '100.8', 'tokens/total': 1075074048, 'tokens/trainable': 397271712, 'epoch': '0.9699'}
 32%|████████████████████████████▍                                                           | 566/1751 [9:28:48<19:38:27, 59.67s/it] 32%|████████████████████████████▍                                                           | 567/1751 [9:29:47<19:37:20, 59.66s/it]                                                                                                                                     {'loss': '0.5043', 'grad_norm': '0.1973', 'learning_rate': '1.618e-05', 'ppl': '1.656', 'memory/max_active (GiB)': '75.51', 'memory/max_allocated (GiB)': '75.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '78.86', 'tokens/total': 1077010944, 'tokens/trainable': 397967904, 'epoch': '0.9716'}
 32%|████████████████████████████▍                                                           | 567/1751 [9:29:47<19:37:20, 59.66s/it] 32%|████████████████████████████▌                                                           | 568/1751 [9:30:50<19:52:59, 60.51s/it]                                                                                                                                     {'loss': '0.5213', 'grad_norm': '0.1973', 'learning_rate': '1.617e-05', 'ppl': '1.684', 'memory/max_active (GiB)': '76.89', 'memory/max_allocated (GiB)': '76.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.35', 'tokens/total': 1079041536, 'tokens/trainable': 398698528, 'epoch': '0.9733'}
 32%|████████████████████████████▌                                                           | 568/1751 [9:30:50<19:52:59, 60.51s/it] 32%|████████████████████████████▌                                                           | 569/1751 [9:31:53<20:05:38, 61.20s/it]                                                                                                                                     {'loss': '0.5124', 'grad_norm': '0.2002', 'learning_rate': '1.615e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '74.41', 'memory/max_allocated (GiB)': '74.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.73', 'tokens/total': 1081044864, 'tokens/trainable': 399434592, 'epoch': '0.975'}
 32%|████████████████████████████▌                                                           | 569/1751 [9:31:53<20:05:38, 61.20s/it] 33%|████████████████████████████▋                                                           | 570/1751 [9:32:52<19:50:41, 60.49s/it]                                                                                                                                     {'loss': '0.5768', 'grad_norm': '0.2217', 'learning_rate': '1.614e-05', 'ppl': '1.78', 'memory/max_active (GiB)': '73.23', 'memory/max_allocated (GiB)': '73.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '106.9', 'tokens/total': 1082857472, 'tokens/trainable': 400116352, 'epoch': '0.9767'}
 33%|████████████████████████████▋                                                           | 570/1751 [9:32:52<19:50:41, 60.49s/it] 33%|████████████████████████████▋                                                           | 571/1751 [9:33:52<19:52:25, 60.63s/it]                                                                                                                                     {'loss': '0.5143', 'grad_norm': '0.2021', 'learning_rate': '1.612e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '75.56', 'memory/max_allocated (GiB)': '75.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.63', 'tokens/total': 1084807040, 'tokens/trainable': 400855712, 'epoch': '0.9785'}
 33%|████████████████████████████▋                                                           | 571/1751 [9:33:52<19:52:25, 60.63s/it] 33%|████████████████████████████▋                                                           | 572/1751 [9:34:52<19:45:55, 60.35s/it]                                                                                                                                     {'loss': '0.5164', 'grad_norm': '0.1865', 'learning_rate': '1.611e-05', 'ppl': '1.676', 'memory/max_active (GiB)': '72.94', 'memory/max_allocated (GiB)': '72.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.3', 'tokens/total': 1086652928, 'tokens/trainable': 401564416, 'epoch': '0.9802'}
 33%|████████████████████████████▋                                                           | 572/1751 [9:34:52<19:45:55, 60.35s/it] 33%|████████████████████████████▊                                                           | 573/1751 [9:35:54<19:50:59, 60.66s/it]                                                                                                                                     {'loss': '0.5009', 'grad_norm': '0.1787', 'learning_rate': '1.609e-05', 'ppl': '1.65', 'memory/max_active (GiB)': '76.28', 'memory/max_allocated (GiB)': '76.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.07', 'tokens/total': 1088589312, 'tokens/trainable': 402309280, 'epoch': '0.9819'}
 33%|████████████████████████████▊                                                           | 573/1751 [9:35:54<19:50:59, 60.66s/it] 33%|████████████████████████████▊                                                           | 574/1751 [9:36:52<19:39:25, 60.12s/it]                                                                                                                                     {'loss': '0.551', 'grad_norm': '0.2061', 'learning_rate': '1.608e-05', 'ppl': '1.735', 'memory/max_active (GiB)': '74.28', 'memory/max_allocated (GiB)': '74.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '134.4', 'tokens/total': 1090389120, 'tokens/trainable': 402978528, 'epoch': '0.9836'}
 33%|████████████████████████████▊                                                           | 574/1751 [9:36:52<19:39:25, 60.12s/it] 33%|████████████████████████████▉                                                           | 575/1751 [9:37:54<19:46:14, 60.52s/it]                                                                                                                                     {'loss': '0.5103', 'grad_norm': '0.1953', 'learning_rate': '1.606e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '73.13', 'memory/max_allocated (GiB)': '73.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '133.5', 'tokens/total': 1092344064, 'tokens/trainable': 403680992, 'epoch': '0.9853'}
 33%|████████████████████████████▉                                                           | 575/1751 [9:37:54<19:46:14, 60.52s/it] 33%|████████████████████████████▉                                                           | 576/1751 [9:38:52<19:33:51, 59.94s/it]                                                                                                                                     {'loss': '0.505', 'grad_norm': '0.1924', 'learning_rate': '1.605e-05', 'ppl': '1.657', 'memory/max_active (GiB)': '74.54', 'memory/max_allocated (GiB)': '74.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.31', 'tokens/total': 1094187520, 'tokens/trainable': 404342464, 'epoch': '0.987'}
 33%|████████████████████████████▉                                                           | 576/1751 [9:38:52<19:33:51, 59.94s/it] 33%|████████████████████████████▉                                                           | 577/1751 [9:39:51<19:22:26, 59.41s/it]                                                                                                                                     {'loss': '0.529', 'grad_norm': '0.21', 'learning_rate': '1.603e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '72.68', 'memory/max_allocated (GiB)': '72.68', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.64', 'tokens/total': 1096023680, 'tokens/trainable': 405006496, 'epoch': '0.9887'}
 33%|████████████████████████████▉                                                           | 577/1751 [9:39:51<19:22:26, 59.41s/it] 33%|█████████████████████████████                                                           | 578/1751 [9:40:51<19:25:54, 59.64s/it]                                                                                                                                     {'loss': '0.5331', 'grad_norm': '0.1953', 'learning_rate': '1.602e-05', 'ppl': '1.704', 'memory/max_active (GiB)': '73.15', 'memory/max_allocated (GiB)': '73.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.75', 'tokens/total': 1097965568, 'tokens/trainable': 405723424, 'epoch': '0.9905'}
 33%|█████████████████████████████                                                           | 578/1751 [9:40:51<19:25:54, 59.64s/it] 33%|█████████████████████████████                                                           | 579/1751 [9:41:51<19:30:30, 59.92s/it]                                                                                                                                     {'loss': '0.4867', 'grad_norm': '0.1885', 'learning_rate': '1.6e-05', 'ppl': '1.627', 'memory/max_active (GiB)': '73.61', 'memory/max_allocated (GiB)': '73.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '163.6', 'tokens/total': 1099875712, 'tokens/trainable': 406413120, 'epoch': '0.9922'}
 33%|█████████████████████████████                                                           | 579/1751 [9:41:51<19:30:30, 59.92s/it] 33%|█████████████████████████████▏                                                          | 580/1751 [9:42:52<19:33:00, 60.10s/it]                                                                                                                                     {'loss': '0.5134', 'grad_norm': '0.1924', 'learning_rate': '1.599e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '77.3', 'memory/max_allocated (GiB)': '77.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.4', 'tokens/total': 1101806592, 'tokens/trainable': 407127744, 'epoch': '0.9939'}
 33%|█████████████████████████████▏                                                          | 580/1751 [9:42:52<19:33:00, 60.10s/it] 33%|█████████████████████████████▏                                                          | 581/1751 [9:43:52<19:29:09, 59.96s/it]                                                                                                                                     {'loss': '0.5255', 'grad_norm': '0.2041', 'learning_rate': '1.597e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '76.61', 'memory/max_allocated (GiB)': '76.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '97.65', 'tokens/total': 1103687552, 'tokens/trainable': 407821472, 'epoch': '0.9956'}
 33%|█████████████████████████████▏                                                          | 581/1751 [9:43:52<19:29:09, 59.96s/it] 33%|█████████████████████████████▏                                                          | 582/1751 [9:44:51<19:27:32, 59.93s/it]                                                                                                                                     {'loss': '0.5413', 'grad_norm': '0.2051', 'learning_rate': '1.596e-05', 'ppl': '1.718', 'memory/max_active (GiB)': '75.41', 'memory/max_allocated (GiB)': '75.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '42.53', 'tokens/total': 1105579520, 'tokens/trainable': 408531776, 'epoch': '0.9973'}
 33%|█████████████████████████████▏                                                          | 582/1751 [9:44:51<19:27:32, 59.93s/it] 33%|█████████████████████████████▎                                                          | 583/1751 [9:45:50<19:20:35, 59.62s/it]                                                                                                                                     {'loss': '0.522', 'grad_norm': '0.2344', 'learning_rate': '1.594e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '73.59', 'memory/max_allocated (GiB)': '73.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '114.6', 'tokens/total': 1107424768, 'tokens/trainable': 409207840, 'epoch': '0.999'}
 33%|█████████████████████████████▎                                                          | 583/1751 [9:45:50<19:20:35, 59.62s/it] 33%|█████████████████████████████▎                                                          | 584/1751 [9:46:26<16:59:14, 52.40s/it]                                                                                                                                     {'loss': '0.5369', 'grad_norm': '0.2236', 'learning_rate': '1.593e-05', 'ppl': '1.711', 'memory/max_active (GiB)': '74.69', 'memory/max_allocated (GiB)': '74.69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '241.8', 'tokens/total': 1108529280, 'tokens/trainable': 409623328, 'epoch': '1'}
 33%|█████████████████████████████▎                                                          | 584/1751 [9:46:26<16:59:14, 52.40s/it][2026-02-04 13:09:53,570] [INFO] [axolotl.core.trainers.base._save:721] [PID:23602] Saving model checkpoint to ./outputs/checkpoint-584

Writing model shards:   0%|                                                                                    | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.64s/it][AWriting model shards: 100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.64s/it]
 33%|█████████████████████████████▍                                                          | 585/1751 [9:47:59<20:54:32, 64.56s/it]                                                                                                                                     {'loss': '0.5421', 'grad_norm': '0.2012', 'learning_rate': '1.591e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '68.93', 'memory/max_allocated (GiB)': '68.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '122.6', 'tokens/total': 1110341504, 'tokens/trainable': 410315616, 'epoch': '1.002'}
 33%|█████████████████████████████▍                                                          | 585/1751 [9:47:59<20:54:32, 64.56s/it] 33%|█████████████████████████████▍                                                          | 586/1751 [9:49:00<20:35:33, 63.63s/it]                                                                                                                                     {'loss': '0.4983', 'grad_norm': '0.1777', 'learning_rate': '1.59e-05', 'ppl': '1.646', 'memory/max_active (GiB)': '71.78', 'memory/max_allocated (GiB)': '71.78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '87.2', 'tokens/total': 1112302592, 'tokens/trainable': 411042976, 'epoch': '1.003'}
 33%|█████████████████████████████▍                                                          | 586/1751 [9:49:00<20:35:33, 63.63s/it] 34%|█████████████████████████████▌                                                          | 587/1751 [9:50:01<20:16:30, 62.71s/it]                                                                                                                                     {'loss': '0.4873', 'grad_norm': '0.1787', 'learning_rate': '1.588e-05', 'ppl': '1.628', 'memory/max_active (GiB)': '75.59', 'memory/max_allocated (GiB)': '75.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '149.4', 'tokens/total': 1114216064, 'tokens/trainable': 411752480, 'epoch': '1.005'}
 34%|█████████████████████████████▌                                                          | 587/1751 [9:50:01<20:16:30, 62.71s/it] 34%|█████████████████████████████▌                                                          | 588/1751 [9:51:02<20:07:10, 62.28s/it]                                                                                                                                     {'loss': '0.5433', 'grad_norm': '0.2012', 'learning_rate': '1.587e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '77.54', 'memory/max_allocated (GiB)': '77.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '184.9', 'tokens/total': 1116178304, 'tokens/trainable': 412510592, 'epoch': '1.007'}
 34%|█████████████████████████████▌                                                          | 588/1751 [9:51:02<20:07:10, 62.28s/it] 34%|█████████████████████████████▌                                                          | 589/1751 [9:52:01<19:47:30, 61.32s/it]                                                                                                                                     {'loss': '0.5422', 'grad_norm': '0.2002', 'learning_rate': '1.585e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '74.06', 'memory/max_allocated (GiB)': '74.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '92.44', 'tokens/total': 1118037376, 'tokens/trainable': 413227392, 'epoch': '1.009'}
 34%|█████████████████████████████▌                                                          | 589/1751 [9:52:01<19:47:30, 61.32s/it] 34%|█████████████████████████████▋                                                          | 590/1751 [9:52:59<19:27:41, 60.35s/it]                                                                                                                                     {'loss': '0.5262', 'grad_norm': '0.1846', 'learning_rate': '1.584e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '74.93', 'memory/max_allocated (GiB)': '74.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '91.89', 'tokens/total': 1119886336, 'tokens/trainable': 413918240, 'epoch': '1.01'}
 34%|█████████████████████████████▋                                                          | 590/1751 [9:52:59<19:27:41, 60.35s/it] 34%|█████████████████████████████▋                                                          | 591/1751 [9:53:58<19:17:17, 59.86s/it]                                                                                                                                     {'loss': '0.5462', 'grad_norm': '0.1953', 'learning_rate': '1.582e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '68.31', 'memory/max_allocated (GiB)': '68.31', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.7', 'tokens/total': 1121755136, 'tokens/trainable': 414606144, 'epoch': '1.012'}
 34%|█████████████████████████████▋                                                          | 591/1751 [9:53:58<19:17:17, 59.86s/it] 34%|█████████████████████████████▊                                                          | 592/1751 [9:55:01<19:32:14, 60.69s/it]                                                                                                                                     {'loss': '0.5072', 'grad_norm': '0.1895', 'learning_rate': '1.58e-05', 'ppl': '1.661', 'memory/max_active (GiB)': '73.02', 'memory/max_allocated (GiB)': '73.02', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '19.74', 'tokens/total': 1123729920, 'tokens/trainable': 415376992, 'epoch': '1.014'}
 34%|█████████████████████████████▊                                                          | 592/1751 [9:55:01<19:32:14, 60.69s/it] 34%|█████████████████████████████▊                                                          | 593/1751 [9:55:59<19:19:59, 60.10s/it]                                                                                                                                     {'loss': '0.5231', 'grad_norm': '0.1953', 'learning_rate': '1.579e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '68.18', 'memory/max_allocated (GiB)': '68.18', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '187.7', 'tokens/total': 1125555712, 'tokens/trainable': 416060704, 'epoch': '1.015'}
 34%|█████████████████████████████▊                                                          | 593/1751 [9:55:59<19:19:59, 60.10s/it] 34%|█████████████████████████████▊                                                          | 594/1751 [9:56:59<19:19:04, 60.11s/it]                                                                                                                                     {'loss': '0.5288', 'grad_norm': '0.1777', 'learning_rate': '1.577e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '75.67', 'memory/max_allocated (GiB)': '75.67', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '187.1', 'tokens/total': 1127457664, 'tokens/trainable': 416772864, 'epoch': '1.017'}
 34%|█████████████████████████████▊                                                          | 594/1751 [9:56:59<19:19:04, 60.11s/it] 34%|█████████████████████████████▉                                                          | 595/1751 [9:58:00<19:19:32, 60.18s/it]                                                                                                                                     {'loss': '0.5516', 'grad_norm': '0.208', 'learning_rate': '1.576e-05', 'ppl': '1.736', 'memory/max_active (GiB)': '71.46', 'memory/max_allocated (GiB)': '71.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '79.11', 'tokens/total': 1129329536, 'tokens/trainable': 417458720, 'epoch': '1.019'}
 34%|█████████████████████████████▉                                                          | 595/1751 [9:58:00<19:19:32, 60.18s/it] 34%|█████████████████████████████▉                                                          | 596/1751 [9:59:01<19:26:26, 60.59s/it]                                                                                                                                     {'loss': '0.5294', 'grad_norm': '0.1875', 'learning_rate': '1.574e-05', 'ppl': '1.698', 'memory/max_active (GiB)': '75.73', 'memory/max_allocated (GiB)': '75.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '113.6', 'tokens/total': 1131287808, 'tokens/trainable': 418177216, 'epoch': '1.021'}
 34%|█████████████████████████████▉                                                          | 596/1751 [9:59:01<19:26:26, 60.59s/it] 34%|█████████████████████████████▋                                                         | 597/1751 [10:00:04<19:37:04, 61.20s/it]                                                                                                                                     {'loss': '0.4897', 'grad_norm': '0.1924', 'learning_rate': '1.573e-05', 'ppl': '1.632', 'memory/max_active (GiB)': '76.82', 'memory/max_allocated (GiB)': '76.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '20.58', 'tokens/total': 1133300224, 'tokens/trainable': 418905088, 'epoch': '1.022'}
 34%|█████████████████████████████▋                                                         | 597/1751 [10:00:04<19:37:04, 61.20s/it] 34%|█████████████████████████████▋                                                         | 598/1751 [10:01:04<19:31:40, 60.97s/it]                                                                                                                                     {'loss': '0.4811', 'grad_norm': '0.1924', 'learning_rate': '1.571e-05', 'ppl': '1.618', 'memory/max_active (GiB)': '70.64', 'memory/max_allocated (GiB)': '70.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '93.55', 'tokens/total': 1135231872, 'tokens/trainable': 419581760, 'epoch': '1.024'}
 34%|█████████████████████████████▋                                                         | 598/1751 [10:01:04<19:31:40, 60.97s/it] 34%|█████████████████████████████▊                                                         | 599/1751 [10:02:01<19:07:18, 59.76s/it]                                                                                                                                     {'loss': '0.5325', 'grad_norm': '0.1924', 'learning_rate': '1.57e-05', 'ppl': '1.703', 'memory/max_active (GiB)': '76.49', 'memory/max_allocated (GiB)': '76.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '78.42', 'tokens/total': 1137004544, 'tokens/trainable': 420246848, 'epoch': '1.026'}
 34%|█████████████████████████████▊                                                         | 599/1751 [10:02:01<19:07:18, 59.76s/it] 34%|█████████████████████████████▊                                                         | 600/1751 [10:03:06<19:35:59, 61.30s/it]                                                                                                                                     {'loss': '0.5134', 'grad_norm': '0.1777', 'learning_rate': '1.568e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '75.55', 'memory/max_allocated (GiB)': '75.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '15.79', 'tokens/total': 1139078656, 'tokens/trainable': 421011392, 'epoch': '1.027'}
 34%|█████████████████████████████▊                                                         | 600/1751 [10:03:06<19:35:59, 61.30s/it] 34%|█████████████████████████████▊                                                         | 601/1751 [10:04:04<19:13:13, 60.17s/it]                                                                                                                                     {'loss': '0.5299', 'grad_norm': '0.1953', 'learning_rate': '1.567e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '70.63', 'memory/max_allocated (GiB)': '70.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '77.56', 'tokens/total': 1140869632, 'tokens/trainable': 421677376, 'epoch': '1.029'}
 34%|█████████████████████████████▊                                                         | 601/1751 [10:04:04<19:13:13, 60.17s/it] 34%|█████████████████████████████▉                                                         | 602/1751 [10:05:03<19:09:37, 60.03s/it]                                                                                                                                     {'loss': '0.5772', 'grad_norm': '0.2061', 'learning_rate': '1.565e-05', 'ppl': '1.781', 'memory/max_active (GiB)': '76.93', 'memory/max_allocated (GiB)': '76.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '55.3', 'tokens/total': 1142736384, 'tokens/trainable': 422383264, 'epoch': '1.031'}
 34%|█████████████████████████████▉                                                         | 602/1751 [10:05:03<19:09:37, 60.03s/it] 34%|█████████████████████████████▉                                                         | 603/1751 [10:06:05<19:16:01, 60.42s/it]                                                                                                                                     {'loss': '0.551', 'grad_norm': '0.2041', 'learning_rate': '1.563e-05', 'ppl': '1.735', 'memory/max_active (GiB)': '73.09', 'memory/max_allocated (GiB)': '73.09', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '163.7', 'tokens/total': 1144692224, 'tokens/trainable': 423103808, 'epoch': '1.033'}
 34%|█████████████████████████████▉                                                         | 603/1751 [10:06:05<19:16:01, 60.42s/it] 34%|██████████████████████████████                                                         | 604/1751 [10:07:04<19:07:58, 60.05s/it]                                                                                                                                     {'loss': '0.514', 'grad_norm': '0.1826', 'learning_rate': '1.562e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '74.04', 'memory/max_allocated (GiB)': '74.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.25', 'tokens/total': 1146555264, 'tokens/trainable': 423788672, 'epoch': '1.034'}
 34%|██████████████████████████████                                                         | 604/1751 [10:07:04<19:07:58, 60.05s/it] 35%|██████████████████████████████                                                         | 605/1751 [10:08:04<19:09:24, 60.18s/it]                                                                                                                                     {'loss': '0.5254', 'grad_norm': '0.1895', 'learning_rate': '1.56e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '76.62', 'memory/max_allocated (GiB)': '76.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '37.86', 'tokens/total': 1148503168, 'tokens/trainable': 424508224, 'epoch': '1.036'}
 35%|██████████████████████████████                                                         | 605/1751 [10:08:04<19:09:24, 60.18s/it] 35%|██████████████████████████████                                                         | 606/1751 [10:09:05<19:10:07, 60.27s/it]                                                                                                                                     {'loss': '0.5187', 'grad_norm': '0.21', 'learning_rate': '1.559e-05', 'ppl': '1.68', 'memory/max_active (GiB)': '71.13', 'memory/max_allocated (GiB)': '71.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '223', 'tokens/total': 1150432256, 'tokens/trainable': 425232896, 'epoch': '1.038'}
 35%|██████████████████████████████                                                         | 606/1751 [10:09:05<19:10:07, 60.27s/it] 35%|██████████████████████████████▏                                                        | 607/1751 [10:10:04<19:03:36, 59.98s/it]                                                                                                                                     {'loss': '0.5232', 'grad_norm': '0.2002', 'learning_rate': '1.557e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '76.3', 'memory/max_allocated (GiB)': '76.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.83', 'tokens/total': 1152321920, 'tokens/trainable': 425915488, 'epoch': '1.039'}
 35%|██████████████████████████████▏                                                        | 607/1751 [10:10:04<19:03:36, 59.98s/it] 35%|██████████████████████████████▏                                                        | 608/1751 [10:11:04<19:03:51, 60.04s/it]                                                                                                                                     {'loss': '0.5428', 'grad_norm': '0.1973', 'learning_rate': '1.556e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '73.81', 'memory/max_allocated (GiB)': '73.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '50.34', 'tokens/total': 1154244608, 'tokens/trainable': 426591744, 'epoch': '1.041'}
 35%|██████████████████████████████▏                                                        | 608/1751 [10:11:04<19:03:51, 60.04s/it] 35%|██████████████████████████████▎                                                        | 609/1751 [10:12:04<19:02:18, 60.02s/it]                                                                                                                                     {'loss': '0.5302', 'grad_norm': '0.1875', 'learning_rate': '1.554e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '71.28', 'memory/max_allocated (GiB)': '71.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '36.83', 'tokens/total': 1156138880, 'tokens/trainable': 427280864, 'epoch': '1.043'}
 35%|██████████████████████████████▎                                                        | 609/1751 [10:12:04<19:02:18, 60.02s/it] 35%|██████████████████████████████▎                                                        | 610/1751 [10:13:06<19:11:26, 60.55s/it]                                                                                                                                     {'loss': '0.524', 'grad_norm': '0.208', 'learning_rate': '1.552e-05', 'ppl': '1.689', 'memory/max_active (GiB)': '76.21', 'memory/max_allocated (GiB)': '76.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.44', 'tokens/total': 1158112896, 'tokens/trainable': 428006656, 'epoch': '1.045'}
 35%|██████████████████████████████▎                                                        | 610/1751 [10:13:06<19:11:26, 60.55s/it] 35%|██████████████████████████████▎                                                        | 611/1751 [10:14:08<19:15:03, 60.79s/it]                                                                                                                                     {'loss': '0.518', 'grad_norm': '0.21', 'learning_rate': '1.551e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '73.1', 'memory/max_allocated (GiB)': '73.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '32.94', 'tokens/total': 1160057344, 'tokens/trainable': 428705376, 'epoch': '1.046'}
 35%|██████████████████████████████▎                                                        | 611/1751 [10:14:08<19:15:03, 60.79s/it] 35%|██████████████████████████████▍                                                        | 612/1751 [10:15:10<19:26:03, 61.43s/it]                                                                                                                                     {'loss': '0.5252', 'grad_norm': '0.1904', 'learning_rate': '1.549e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '74.59', 'memory/max_allocated (GiB)': '74.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.21', 'tokens/total': 1162078976, 'tokens/trainable': 429466464, 'epoch': '1.048'}
 35%|██████████████████████████████▍                                                        | 612/1751 [10:15:10<19:26:03, 61.43s/it] 35%|██████████████████████████████▍                                                        | 613/1751 [10:16:13<19:31:30, 61.77s/it]                                                                                                                                     {'loss': '0.5082', 'grad_norm': '0.1807', 'learning_rate': '1.548e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '76.51', 'memory/max_allocated (GiB)': '76.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '43.62', 'tokens/total': 1164055808, 'tokens/trainable': 430186048, 'epoch': '1.05'}
 35%|██████████████████████████████▍                                                        | 613/1751 [10:16:13<19:31:30, 61.77s/it] 35%|██████████████████████████████▌                                                        | 614/1751 [10:17:12<19:14:38, 60.93s/it]                                                                                                                                     {'loss': '0.573', 'grad_norm': '0.207', 'learning_rate': '1.546e-05', 'ppl': '1.774', 'memory/max_active (GiB)': '72.75', 'memory/max_allocated (GiB)': '72.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '123.5', 'tokens/total': 1165908352, 'tokens/trainable': 430862912, 'epoch': '1.051'}
 35%|██████████████████████████████▌                                                        | 614/1751 [10:17:12<19:14:38, 60.93s/it] 35%|██████████████████████████████▌                                                        | 615/1751 [10:18:12<19:08:06, 60.64s/it]                                                                                                                                     {'loss': '0.4925', 'grad_norm': '0.1895', 'learning_rate': '1.545e-05', 'ppl': '1.636', 'memory/max_active (GiB)': '76.66', 'memory/max_allocated (GiB)': '76.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '158.9', 'tokens/total': 1167803008, 'tokens/trainable': 431565152, 'epoch': '1.053'}
 35%|██████████████████████████████▌                                                        | 615/1751 [10:18:12<19:08:06, 60.64s/it] 35%|██████████████████████████████▌                                                        | 616/1751 [10:19:11<18:57:01, 60.11s/it]                                                                                                                                     {'loss': '0.501', 'grad_norm': '0.2002', 'learning_rate': '1.543e-05', 'ppl': '1.65', 'memory/max_active (GiB)': '74.97', 'memory/max_allocated (GiB)': '74.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.73', 'tokens/total': 1169646336, 'tokens/trainable': 432217184, 'epoch': '1.055'}
 35%|██████████████████████████████▌                                                        | 616/1751 [10:19:11<18:57:01, 60.11s/it] 35%|██████████████████████████████▋                                                        | 617/1751 [10:20:12<19:03:26, 60.50s/it]                                                                                                                                     {'loss': '0.5035', 'grad_norm': '0.1777', 'learning_rate': '1.541e-05', 'ppl': '1.655', 'memory/max_active (GiB)': '70.37', 'memory/max_allocated (GiB)': '70.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '62.05', 'tokens/total': 1171634944, 'tokens/trainable': 432954400, 'epoch': '1.057'}
 35%|██████████████████████████████▋                                                        | 617/1751 [10:20:12<19:03:26, 60.50s/it] 35%|██████████████████████████████▋                                                        | 618/1751 [10:21:12<18:56:34, 60.19s/it]                                                                                                                                     {'loss': '0.5385', 'grad_norm': '0.1826', 'learning_rate': '1.54e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '74.4', 'memory/max_allocated (GiB)': '74.4', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '215.8', 'tokens/total': 1173526400, 'tokens/trainable': 433661216, 'epoch': '1.058'}
 35%|██████████████████████████████▋                                                        | 618/1751 [10:21:12<18:56:34, 60.19s/it] 35%|██████████████████████████████▊                                                        | 619/1751 [10:22:09<18:36:45, 59.19s/it]                                                                                                                                     {'loss': '0.5144', 'grad_norm': '0.2129', 'learning_rate': '1.538e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '71.86', 'memory/max_allocated (GiB)': '71.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '126.9', 'tokens/total': 1175284352, 'tokens/trainable': 434291840, 'epoch': '1.06'}
 35%|██████████████████████████████▊                                                        | 619/1751 [10:22:09<18:36:45, 59.19s/it] 35%|██████████████████████████████▊                                                        | 620/1751 [10:23:10<18:47:24, 59.81s/it]                                                                                                                                     {'loss': '0.5036', 'grad_norm': '0.1865', 'learning_rate': '1.537e-05', 'ppl': '1.655', 'memory/max_active (GiB)': '74.44', 'memory/max_allocated (GiB)': '74.44', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '167.4', 'tokens/total': 1177229184, 'tokens/trainable': 434999584, 'epoch': '1.062'}
 35%|██████████████████████████████▊                                                        | 620/1751 [10:23:10<18:47:24, 59.81s/it] 35%|██████████████████████████████▊                                                        | 621/1751 [10:24:10<18:47:30, 59.87s/it]                                                                                                                                     {'loss': '0.5502', 'grad_norm': '0.1973', 'learning_rate': '1.535e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '68.37', 'memory/max_allocated (GiB)': '68.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '160.1', 'tokens/total': 1179148032, 'tokens/trainable': 435692384, 'epoch': '1.063'}
 35%|██████████████████████████████▊                                                        | 621/1751 [10:24:10<18:47:30, 59.87s/it] 36%|██████████████████████████████▉                                                        | 622/1751 [10:25:11<18:56:32, 60.40s/it]                                                                                                                                     {'loss': '0.4817', 'grad_norm': '0.1719', 'learning_rate': '1.533e-05', 'ppl': '1.619', 'memory/max_active (GiB)': '76.3', 'memory/max_allocated (GiB)': '76.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '155.7', 'tokens/total': 1181110016, 'tokens/trainable': 436423424, 'epoch': '1.065'}
 36%|██████████████████████████████▉                                                        | 622/1751 [10:25:11<18:56:32, 60.40s/it] 36%|██████████████████████████████▉                                                        | 623/1751 [10:26:11<18:52:35, 60.24s/it]                                                                                                                                     {'loss': '0.5165', 'grad_norm': '0.1904', 'learning_rate': '1.532e-05', 'ppl': '1.676', 'memory/max_active (GiB)': '74.59', 'memory/max_allocated (GiB)': '74.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.31', 'tokens/total': 1182992256, 'tokens/trainable': 437116320, 'epoch': '1.067'}
 36%|██████████████████████████████▉                                                        | 623/1751 [10:26:11<18:52:35, 60.24s/it] 36%|███████████████████████████████                                                        | 624/1751 [10:27:09<18:37:36, 59.50s/it]                                                                                                                                     {'loss': '0.5268', 'grad_norm': '0.209', 'learning_rate': '1.53e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '72.98', 'memory/max_allocated (GiB)': '72.98', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '244', 'tokens/total': 1184781440, 'tokens/trainable': 437790208, 'epoch': '1.069'}
 36%|███████████████████████████████                                                        | 624/1751 [10:27:09<18:37:36, 59.50s/it] 36%|███████████████████████████████                                                        | 625/1751 [10:28:10<18:42:17, 59.80s/it]                                                                                                                                     {'loss': '0.5228', 'grad_norm': '0.1855', 'learning_rate': '1.529e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '73.93', 'memory/max_allocated (GiB)': '73.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '54.83', 'tokens/total': 1186703616, 'tokens/trainable': 438484608, 'epoch': '1.07'}
 36%|███████████████████████████████                                                        | 625/1751 [10:28:10<18:42:17, 59.80s/it] 36%|███████████████████████████████                                                        | 626/1751 [10:29:07<18:29:57, 59.20s/it]                                                                                                                                     {'loss': '0.5232', 'grad_norm': '0.2139', 'learning_rate': '1.527e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '67.99', 'memory/max_allocated (GiB)': '67.99', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '31.28', 'tokens/total': 1188555904, 'tokens/trainable': 439137056, 'epoch': '1.072'}
 36%|███████████████████████████████                                                        | 626/1751 [10:29:07<18:29:57, 59.20s/it] 36%|███████████████████████████████▏                                                       | 627/1751 [10:30:08<18:37:21, 59.65s/it]                                                                                                                                     {'loss': '0.5343', 'grad_norm': '0.1865', 'learning_rate': '1.525e-05', 'ppl': '1.706', 'memory/max_active (GiB)': '76.57', 'memory/max_allocated (GiB)': '76.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.67', 'tokens/total': 1190468480, 'tokens/trainable': 439836256, 'epoch': '1.074'}
 36%|███████████████████████████████▏                                                       | 627/1751 [10:30:08<18:37:21, 59.65s/it] 36%|███████████████████████████████▏                                                       | 628/1751 [10:31:09<18:44:18, 60.07s/it]                                                                                                                                     {'loss': '0.5085', 'grad_norm': '0.1904', 'learning_rate': '1.524e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '75.38', 'memory/max_allocated (GiB)': '75.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.56', 'tokens/total': 1192376960, 'tokens/trainable': 440576352, 'epoch': '1.075'}
 36%|███████████████████████████████▏                                                       | 628/1751 [10:31:09<18:44:18, 60.07s/it] 36%|███████████████████████████████▎                                                       | 629/1751 [10:32:10<18:50:34, 60.46s/it]                                                                                                                                     {'loss': '0.5336', 'grad_norm': '0.1973', 'learning_rate': '1.522e-05', 'ppl': '1.705', 'memory/max_active (GiB)': '75.15', 'memory/max_allocated (GiB)': '75.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.56', 'tokens/total': 1194342912, 'tokens/trainable': 441300864, 'epoch': '1.077'}
 36%|███████████████████████████████▎                                                       | 629/1751 [10:32:11<18:50:34, 60.46s/it] 36%|███████████████████████████████▎                                                       | 630/1751 [10:33:11<18:51:08, 60.54s/it]                                                                                                                                     {'loss': '0.5242', 'grad_norm': '0.2012', 'learning_rate': '1.521e-05', 'ppl': '1.689', 'memory/max_active (GiB)': '77.36', 'memory/max_allocated (GiB)': '77.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111.6', 'tokens/total': 1196243840, 'tokens/trainable': 441986848, 'epoch': '1.079'}
 36%|███████████████████████████████▎                                                       | 630/1751 [10:33:11<18:51:08, 60.54s/it] 36%|███████████████████████████████▎                                                       | 631/1751 [10:34:07<18:24:16, 59.16s/it]                                                                                                                                     {'loss': '0.5479', 'grad_norm': '0.208', 'learning_rate': '1.519e-05', 'ppl': '1.73', 'memory/max_active (GiB)': '73.86', 'memory/max_allocated (GiB)': '73.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '194.6', 'tokens/total': 1197950976, 'tokens/trainable': 442622880, 'epoch': '1.081'}
 36%|███████████████████████████████▎                                                       | 631/1751 [10:34:07<18:24:16, 59.16s/it] 36%|███████████████████████████████▍                                                       | 632/1751 [10:35:05<18:14:34, 58.69s/it]                                                                                                                                     {'loss': '0.5563', 'grad_norm': '0.2119', 'learning_rate': '1.517e-05', 'ppl': '1.744', 'memory/max_active (GiB)': '72.85', 'memory/max_allocated (GiB)': '72.85', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '159', 'tokens/total': 1199718016, 'tokens/trainable': 443287872, 'epoch': '1.082'}
 36%|███████████████████████████████▍                                                       | 632/1751 [10:35:05<18:14:34, 58.69s/it] 36%|███████████████████████████████▍                                                       | 633/1751 [10:36:08<18:37:46, 59.99s/it]                                                                                                                                     {'loss': '0.4906', 'grad_norm': '0.2012', 'learning_rate': '1.516e-05', 'ppl': '1.633', 'memory/max_active (GiB)': '71.63', 'memory/max_allocated (GiB)': '71.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '125.6', 'tokens/total': 1201755904, 'tokens/trainable': 444061920, 'epoch': '1.084'}
 36%|███████████████████████████████▍                                                       | 633/1751 [10:36:08<18:37:46, 59.99s/it] 36%|███████████████████████████████▌                                                       | 634/1751 [10:37:09<18:42:20, 60.29s/it]                                                                                                                                     {'loss': '0.5138', 'grad_norm': '0.209', 'learning_rate': '1.514e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '71.46', 'memory/max_allocated (GiB)': '71.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '199.6', 'tokens/total': 1203698304, 'tokens/trainable': 444809056, 'epoch': '1.086'}
 36%|███████████████████████████████▌                                                       | 634/1751 [10:37:09<18:42:20, 60.29s/it] 36%|███████████████████████████████▌                                                       | 635/1751 [10:38:10<18:46:06, 60.54s/it]                                                                                                                                     {'loss': '0.534', 'grad_norm': '0.1895', 'learning_rate': '1.512e-05', 'ppl': '1.706', 'memory/max_active (GiB)': '74.43', 'memory/max_allocated (GiB)': '74.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109', 'tokens/total': 1205643904, 'tokens/trainable': 445527392, 'epoch': '1.087'}
 36%|███████████████████████████████▌                                                       | 635/1751 [10:38:10<18:46:06, 60.54s/it] 36%|███████████████████████████████▌                                                       | 636/1751 [10:39:11<18:46:02, 60.59s/it]                                                                                                                                     {'loss': '0.5441', 'grad_norm': '0.1953', 'learning_rate': '1.511e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '74.94', 'memory/max_allocated (GiB)': '74.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.26', 'tokens/total': 1207548800, 'tokens/trainable': 446232416, 'epoch': '1.089'}
 36%|███████████████████████████████▌                                                       | 636/1751 [10:39:11<18:46:02, 60.59s/it] 36%|███████████████████████████████▋                                                       | 637/1751 [10:40:09<18:34:51, 60.05s/it]                                                                                                                                     {'loss': '0.5674', 'grad_norm': '0.2139', 'learning_rate': '1.509e-05', 'ppl': '1.764', 'memory/max_active (GiB)': '75.97', 'memory/max_allocated (GiB)': '75.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '72.31', 'tokens/total': 1209419520, 'tokens/trainable': 446871328, 'epoch': '1.091'}
 36%|███████████████████████████████▋                                                       | 637/1751 [10:40:09<18:34:51, 60.05s/it] 36%|███████████████████████████████▋                                                       | 638/1751 [10:41:12<18:47:09, 60.76s/it]                                                                                                                                     {'loss': '0.4946', 'grad_norm': '0.1797', 'learning_rate': '1.508e-05', 'ppl': '1.64', 'memory/max_active (GiB)': '76.25', 'memory/max_allocated (GiB)': '76.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '147.1', 'tokens/total': 1211409792, 'tokens/trainable': 447614912, 'epoch': '1.093'}
 36%|███████████████████████████████▋                                                       | 638/1751 [10:41:12<18:47:09, 60.76s/it] 36%|███████████████████████████████▋                                                       | 639/1751 [10:42:12<18:40:21, 60.45s/it]                                                                                                                                     {'loss': '0.5405', 'grad_norm': '0.2002', 'learning_rate': '1.506e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '71.21', 'memory/max_allocated (GiB)': '71.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '77.66', 'tokens/total': 1213286528, 'tokens/trainable': 448288736, 'epoch': '1.094'}
 36%|███████████████████████████████▋                                                       | 639/1751 [10:42:12<18:40:21, 60.45s/it] 37%|███████████████████████████████▊                                                       | 640/1751 [10:43:09<18:25:14, 59.69s/it]                                                                                                                                     {'loss': '0.5202', 'grad_norm': '0.1865', 'learning_rate': '1.504e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '63.87', 'memory/max_allocated (GiB)': '63.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '138.8', 'tokens/total': 1215098368, 'tokens/trainable': 448954080, 'epoch': '1.096'}
 37%|███████████████████████████████▊                                                       | 640/1751 [10:43:09<18:25:14, 59.69s/it] 37%|███████████████████████████████▊                                                       | 641/1751 [10:44:12<18:41:01, 60.60s/it]                                                                                                                                     {'loss': '0.5101', 'grad_norm': '0.1904', 'learning_rate': '1.503e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '75.04', 'memory/max_allocated (GiB)': '75.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.38', 'tokens/total': 1217097344, 'tokens/trainable': 449696960, 'epoch': '1.098'}
 37%|███████████████████████████████▊                                                       | 641/1751 [10:44:12<18:41:01, 60.60s/it] 37%|███████████████████████████████▉                                                       | 642/1751 [10:45:12<18:36:17, 60.39s/it]                                                                                                                                     {'loss': '0.529', 'grad_norm': '0.1846', 'learning_rate': '1.501e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '73.7', 'memory/max_allocated (GiB)': '73.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.14', 'tokens/total': 1218986624, 'tokens/trainable': 450392096, 'epoch': '1.099'}
 37%|███████████████████████████████▉                                                       | 642/1751 [10:45:12<18:36:17, 60.39s/it] 37%|███████████████████████████████▉                                                       | 643/1751 [10:46:11<18:27:49, 59.99s/it]                                                                                                                                     {'loss': '0.5317', 'grad_norm': '0.1904', 'learning_rate': '1.499e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '74.12', 'memory/max_allocated (GiB)': '74.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '53.48', 'tokens/total': 1220863488, 'tokens/trainable': 451074912, 'epoch': '1.101'}
 37%|███████████████████████████████▉                                                       | 643/1751 [10:46:11<18:27:49, 59.99s/it] 37%|███████████████████████████████▉                                                       | 644/1751 [10:47:11<18:27:05, 60.00s/it]                                                                                                                                     {'loss': '0.5142', 'grad_norm': '0.2041', 'learning_rate': '1.498e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '73.09', 'memory/max_allocated (GiB)': '73.09', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '13.77', 'tokens/total': 1222779648, 'tokens/trainable': 451769600, 'epoch': '1.103'}
 37%|███████████████████████████████▉                                                       | 644/1751 [10:47:11<18:27:05, 60.00s/it] 37%|████████████████████████████████                                                       | 645/1751 [10:48:09<18:12:09, 59.25s/it]                                                                                                                                     {'loss': '0.5227', 'grad_norm': '0.208', 'learning_rate': '1.496e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '69.57', 'memory/max_allocated (GiB)': '69.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.04', 'tokens/total': 1224575360, 'tokens/trainable': 452388352, 'epoch': '1.105'}
 37%|████████████████████████████████                                                       | 645/1751 [10:48:09<18:12:09, 59.25s/it] 37%|████████████████████████████████                                                       | 646/1751 [10:49:07<18:05:46, 58.96s/it]                                                                                                                                     {'loss': '0.5184', 'grad_norm': '0.2041', 'learning_rate': '1.495e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '75.83', 'memory/max_allocated (GiB)': '75.83', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '148.9', 'tokens/total': 1226385536, 'tokens/trainable': 453070752, 'epoch': '1.106'}
 37%|████████████████████████████████                                                       | 646/1751 [10:49:07<18:05:46, 58.96s/it] 37%|████████████████████████████████▏                                                      | 647/1751 [10:50:07<18:08:59, 59.18s/it]                                                                                                                                     {'loss': '0.5142', 'grad_norm': '0.1836', 'learning_rate': '1.493e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '72.03', 'memory/max_allocated (GiB)': '72.03', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '20.14', 'tokens/total': 1228284800, 'tokens/trainable': 453786400, 'epoch': '1.108'}
 37%|████████████████████████████████▏                                                      | 647/1751 [10:50:07<18:08:59, 59.18s/it] 37%|████████████████████████████████▏                                                      | 648/1751 [10:51:06<18:08:33, 59.21s/it]                                                                                                                                     {'loss': '0.5407', 'grad_norm': '0.1992', 'learning_rate': '1.491e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '71.2', 'memory/max_allocated (GiB)': '71.2', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '55.6', 'tokens/total': 1230171264, 'tokens/trainable': 454461472, 'epoch': '1.11'}
 37%|████████████████████████████████▏                                                      | 648/1751 [10:51:06<18:08:33, 59.21s/it] 37%|████████████████████████████████▏                                                      | 649/1751 [10:52:06<18:14:12, 59.58s/it]                                                                                                                                     {'loss': '0.5235', 'grad_norm': '0.1924', 'learning_rate': '1.49e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '69.78', 'memory/max_allocated (GiB)': '69.78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '25.49', 'tokens/total': 1232098432, 'tokens/trainable': 455173536, 'epoch': '1.111'}
 37%|████████████████████████████████▏                                                      | 649/1751 [10:52:06<18:14:12, 59.58s/it] 37%|████████████████████████████████▎                                                      | 650/1751 [10:53:08<18:26:25, 60.30s/it]                                                                                                                                     {'loss': '0.5155', 'grad_norm': '0.1875', 'learning_rate': '1.488e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '71.61', 'memory/max_allocated (GiB)': '71.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '119.9', 'tokens/total': 1234060544, 'tokens/trainable': 455892768, 'epoch': '1.113'}
 37%|████████████████████████████████▎                                                      | 650/1751 [10:53:08<18:26:25, 60.30s/it] 37%|████████████████████████████████▎                                                      | 651/1751 [10:54:08<18:19:50, 59.99s/it]                                                                                                                                     {'loss': '0.5269', 'grad_norm': '0.1943', 'learning_rate': '1.486e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '71.51', 'memory/max_allocated (GiB)': '71.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '21.36', 'tokens/total': 1235933440, 'tokens/trainable': 456583328, 'epoch': '1.115'}
 37%|████████████████████████████████▎                                                      | 651/1751 [10:54:08<18:19:50, 59.99s/it] 37%|████████████████████████████████▍                                                      | 652/1751 [10:55:07<18:13:52, 59.72s/it]                                                                                                                                     {'loss': '0.5267', 'grad_norm': '0.1953', 'learning_rate': '1.485e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '72.33', 'memory/max_allocated (GiB)': '72.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '165.3', 'tokens/total': 1237793280, 'tokens/trainable': 457245760, 'epoch': '1.117'}
 37%|████████████████████████████████▍                                                      | 652/1751 [10:55:07<18:13:52, 59.72s/it] 37%|████████████████████████████████▍                                                      | 653/1751 [10:56:09<18:29:42, 60.64s/it]                                                                                                                                     {'loss': '0.4962', 'grad_norm': '0.1777', 'learning_rate': '1.483e-05', 'ppl': '1.643', 'memory/max_active (GiB)': '72.01', 'memory/max_allocated (GiB)': '72.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.79', 'tokens/total': 1239792384, 'tokens/trainable': 457981024, 'epoch': '1.118'}
 37%|████████████████████████████████▍                                                      | 653/1751 [10:56:09<18:29:42, 60.64s/it] 37%|████████████████████████████████▍                                                      | 654/1751 [10:57:10<18:30:30, 60.74s/it]                                                                                                                                     {'loss': '0.4879', 'grad_norm': '0.1846', 'learning_rate': '1.481e-05', 'ppl': '1.629', 'memory/max_active (GiB)': '75.02', 'memory/max_allocated (GiB)': '75.02', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.05', 'tokens/total': 1241699200, 'tokens/trainable': 458706432, 'epoch': '1.12'}
 37%|████████████████████████████████▍                                                      | 654/1751 [10:57:10<18:30:30, 60.74s/it] 37%|████████████████████████████████▌                                                      | 655/1751 [10:58:11<18:27:03, 60.61s/it]                                                                                                                                     {'loss': '0.5399', 'grad_norm': '0.1885', 'learning_rate': '1.48e-05', 'ppl': '1.716', 'memory/max_active (GiB)': '73.49', 'memory/max_allocated (GiB)': '73.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '100.8', 'tokens/total': 1243591552, 'tokens/trainable': 459404416, 'epoch': '1.122'}
 37%|████████████████████████████████▌                                                      | 655/1751 [10:58:11<18:27:03, 60.61s/it] 37%|████████████████████████████████▌                                                      | 656/1751 [10:59:09<18:13:59, 59.94s/it]                                                                                                                                     {'loss': '0.5539', 'grad_norm': '0.207', 'learning_rate': '1.478e-05', 'ppl': '1.74', 'memory/max_active (GiB)': '74.7', 'memory/max_allocated (GiB)': '74.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.74', 'tokens/total': 1245411200, 'tokens/trainable': 460055360, 'epoch': '1.123'}
 37%|████████████████████████████████▌                                                      | 656/1751 [10:59:09<18:13:59, 59.94s/it] 38%|████████████████████████████████▋                                                      | 657/1751 [11:00:08<18:06:13, 59.57s/it]                                                                                                                                     {'loss': '0.5257', 'grad_norm': '0.1865', 'learning_rate': '1.476e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '74.16', 'memory/max_allocated (GiB)': '74.16', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.82', 'tokens/total': 1247246592, 'tokens/trainable': 460738112, 'epoch': '1.125'}
 38%|████████████████████████████████▋                                                      | 657/1751 [11:00:08<18:06:13, 59.57s/it] 38%|████████████████████████████████▋                                                      | 658/1751 [11:01:08<18:08:03, 59.73s/it]                                                                                                                                     {'loss': '0.4976', 'grad_norm': '0.1914', 'learning_rate': '1.475e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '77.14', 'memory/max_allocated (GiB)': '77.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '152.5', 'tokens/total': 1249162496, 'tokens/trainable': 461449984, 'epoch': '1.127'}
 38%|████████████████████████████████▋                                                      | 658/1751 [11:01:08<18:08:03, 59.73s/it] 38%|████████████████████████████████▋                                                      | 659/1751 [11:02:06<17:55:59, 59.12s/it]                                                                                                                                     {'loss': '0.5485', 'grad_norm': '0.207', 'learning_rate': '1.473e-05', 'ppl': '1.731', 'memory/max_active (GiB)': '70.15', 'memory/max_allocated (GiB)': '70.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '65.28', 'tokens/total': 1250923264, 'tokens/trainable': 462123104, 'epoch': '1.129'}
 38%|████████████████████████████████▋                                                      | 659/1751 [11:02:06<17:55:59, 59.12s/it] 38%|████████████████████████████████▊                                                      | 660/1751 [11:03:05<17:55:56, 59.17s/it]                                                                                                                                     {'loss': '0.509', 'grad_norm': '0.2021', 'learning_rate': '1.471e-05', 'ppl': '1.664', 'memory/max_active (GiB)': '73.47', 'memory/max_allocated (GiB)': '73.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '18.96', 'tokens/total': 1252765184, 'tokens/trainable': 462799840, 'epoch': '1.13'}
 38%|████████████████████████████████▊                                                      | 660/1751 [11:03:05<17:55:56, 59.17s/it] 38%|████████████████████████████████▊                                                      | 661/1751 [11:04:11<18:31:32, 61.19s/it]                                                                                                                                     {'loss': '0.489', 'grad_norm': '0.1836', 'learning_rate': '1.47e-05', 'ppl': '1.631', 'memory/max_active (GiB)': '74.96', 'memory/max_allocated (GiB)': '74.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '62.33', 'tokens/total': 1254873856, 'tokens/trainable': 463585632, 'epoch': '1.132'}
 38%|████████████████████████████████▊                                                      | 661/1751 [11:04:11<18:31:32, 61.19s/it] 38%|████████████████████████████████▉                                                      | 662/1751 [11:05:12<18:28:15, 61.06s/it]                                                                                                                                     {'loss': '0.5124', 'grad_norm': '0.1904', 'learning_rate': '1.468e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '76.32', 'memory/max_allocated (GiB)': '76.32', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '135', 'tokens/total': 1256824192, 'tokens/trainable': 464296480, 'epoch': '1.134'}
 38%|████████████████████████████████▉                                                      | 662/1751 [11:05:12<18:28:15, 61.06s/it] 38%|████████████████████████████████▉                                                      | 663/1751 [11:06:09<18:09:10, 60.07s/it]                                                                                                                                     {'loss': '0.5548', 'grad_norm': '0.1953', 'learning_rate': '1.466e-05', 'ppl': '1.742', 'memory/max_active (GiB)': '76.29', 'memory/max_allocated (GiB)': '76.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.21', 'tokens/total': 1258615168, 'tokens/trainable': 464982656, 'epoch': '1.135'}
 38%|████████████████████████████████▉                                                      | 663/1751 [11:06:09<18:09:10, 60.07s/it] 38%|████████████████████████████████▉                                                      | 664/1751 [11:07:12<18:22:05, 60.83s/it]                                                                                                                                     {'loss': '0.4861', 'grad_norm': '0.1836', 'learning_rate': '1.465e-05', 'ppl': '1.626', 'memory/max_active (GiB)': '74.86', 'memory/max_allocated (GiB)': '74.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.81', 'tokens/total': 1260626944, 'tokens/trainable': 465744992, 'epoch': '1.137'}
 38%|████████████████████████████████▉                                                      | 664/1751 [11:07:12<18:22:05, 60.83s/it] 38%|█████████████████████████████████                                                      | 665/1751 [11:08:13<18:23:01, 60.94s/it]                                                                                                                                     {'loss': '0.5498', 'grad_norm': '0.1865', 'learning_rate': '1.463e-05', 'ppl': '1.733', 'memory/max_active (GiB)': '73.53', 'memory/max_allocated (GiB)': '73.53', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.3', 'tokens/total': 1262564480, 'tokens/trainable': 466459744, 'epoch': '1.139'}
 38%|█████████████████████████████████                                                      | 665/1751 [11:08:13<18:23:01, 60.94s/it] 38%|█████████████████████████████████                                                      | 666/1751 [11:09:10<18:00:14, 59.74s/it]                                                                                                                                     {'loss': '0.5539', 'grad_norm': '0.2178', 'learning_rate': '1.461e-05', 'ppl': '1.74', 'memory/max_active (GiB)': '76.57', 'memory/max_allocated (GiB)': '76.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '130.4', 'tokens/total': 1264355328, 'tokens/trainable': 467093056, 'epoch': '1.141'}
 38%|█████████████████████████████████                                                      | 666/1751 [11:09:10<18:00:14, 59.74s/it] 38%|█████████████████████████████████▏                                                     | 667/1751 [11:10:13<18:15:20, 60.63s/it]                                                                                                                                     {'loss': '0.5214', 'grad_norm': '0.1943', 'learning_rate': '1.46e-05', 'ppl': '1.684', 'memory/max_active (GiB)': '75.73', 'memory/max_allocated (GiB)': '75.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '106.8', 'tokens/total': 1266374656, 'tokens/trainable': 467846016, 'epoch': '1.142'}
 38%|█████████████████████████████████▏                                                     | 667/1751 [11:10:13<18:15:20, 60.63s/it] 38%|█████████████████████████████████▏                                                     | 668/1751 [11:11:11<18:02:26, 59.97s/it]                                                                                                                                     {'loss': '0.5797', 'grad_norm': '0.2119', 'learning_rate': '1.458e-05', 'ppl': '1.786', 'memory/max_active (GiB)': '73.89', 'memory/max_allocated (GiB)': '73.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '68.47', 'tokens/total': 1268167936, 'tokens/trainable': 468465024, 'epoch': '1.144'}
 38%|█████████████████████████████████▏                                                     | 668/1751 [11:11:11<18:02:26, 59.97s/it] 38%|█████████████████████████████████▏                                                     | 669/1751 [11:12:10<17:54:12, 59.57s/it]                                                                                                                                     {'loss': '0.5704', 'grad_norm': '0.2178', 'learning_rate': '1.456e-05', 'ppl': '1.769', 'memory/max_active (GiB)': '74.92', 'memory/max_allocated (GiB)': '74.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.51', 'tokens/total': 1270000256, 'tokens/trainable': 469151904, 'epoch': '1.146'}
 38%|█████████████████████████████████▏                                                     | 669/1751 [11:12:10<17:54:12, 59.57s/it] 38%|█████████████████████████████████▎                                                     | 670/1751 [11:13:08<17:45:06, 59.12s/it]                                                                                                                                     {'loss': '0.5588', 'grad_norm': '0.2002', 'learning_rate': '1.455e-05', 'ppl': '1.749', 'memory/max_active (GiB)': '64.82', 'memory/max_allocated (GiB)': '64.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '103.2', 'tokens/total': 1271812224, 'tokens/trainable': 469803520, 'epoch': '1.147'}
 38%|█████████████████████████████████▎                                                     | 670/1751 [11:13:08<17:45:06, 59.12s/it] 38%|█████████████████████████████████▎                                                     | 671/1751 [11:14:07<17:44:28, 59.14s/it]                                                                                                                                     {'loss': '0.5027', 'grad_norm': '0.1953', 'learning_rate': '1.453e-05', 'ppl': '1.653', 'memory/max_active (GiB)': '72.76', 'memory/max_allocated (GiB)': '72.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '143.8', 'tokens/total': 1273651840, 'tokens/trainable': 470509728, 'epoch': '1.149'}
 38%|█████████████████████████████████▎                                                     | 671/1751 [11:14:07<17:44:28, 59.14s/it] 38%|█████████████████████████████████▍                                                     | 672/1751 [11:15:10<18:02:29, 60.19s/it]                                                                                                                                     {'loss': '0.5502', 'grad_norm': '0.2129', 'learning_rate': '1.451e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '77.88', 'memory/max_allocated (GiB)': '77.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '74.36', 'tokens/total': 1275648256, 'tokens/trainable': 471259456, 'epoch': '1.151'}
 38%|█████████████████████████████████▍                                                     | 672/1751 [11:15:10<18:02:29, 60.19s/it] 38%|█████████████████████████████████▍                                                     | 673/1751 [11:16:09<17:55:50, 59.88s/it]                                                                                                                                     {'loss': '0.5004', 'grad_norm': '0.2158', 'learning_rate': '1.45e-05', 'ppl': '1.649', 'memory/max_active (GiB)': '72.77', 'memory/max_allocated (GiB)': '72.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '32.71', 'tokens/total': 1277482880, 'tokens/trainable': 471906720, 'epoch': '1.153'}
 38%|█████████████████████████████████▍                                                     | 673/1751 [11:16:09<17:55:50, 59.88s/it] 38%|█████████████████████████████████▍                                                     | 674/1751 [11:17:08<17:52:18, 59.74s/it]                                                                                                                                     {'loss': '0.5232', 'grad_norm': '0.1914', 'learning_rate': '1.448e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '72.26', 'memory/max_allocated (GiB)': '72.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '156.9', 'tokens/total': 1279376256, 'tokens/trainable': 472580736, 'epoch': '1.154'}
 38%|█████████████████████████████████▍                                                     | 674/1751 [11:17:08<17:52:18, 59.74s/it] 39%|█████████████████████████████████▌                                                     | 675/1751 [11:18:09<17:57:38, 60.09s/it]                                                                                                                                     {'loss': '0.5093', 'grad_norm': '0.1895', 'learning_rate': '1.446e-05', 'ppl': '1.664', 'memory/max_active (GiB)': '71.96', 'memory/max_allocated (GiB)': '71.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.65', 'tokens/total': 1281298432, 'tokens/trainable': 473292544, 'epoch': '1.156'}
 39%|█████████████████████████████████▌                                                     | 675/1751 [11:18:09<17:57:38, 60.09s/it] 39%|█████████████████████████████████▌                                                     | 676/1751 [11:19:08<17:47:43, 59.59s/it]                                                                                                                                     {'loss': '0.5061', 'grad_norm': '0.1943', 'learning_rate': '1.445e-05', 'ppl': '1.659', 'memory/max_active (GiB)': '74.82', 'memory/max_allocated (GiB)': '74.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '53.56', 'tokens/total': 1283151360, 'tokens/trainable': 473964128, 'epoch': '1.158'}
 39%|█████████████████████████████████▌                                                     | 676/1751 [11:19:08<17:47:43, 59.59s/it] 39%|█████████████████████████████████▋                                                     | 677/1751 [11:20:09<17:55:53, 60.11s/it]                                                                                                                                     {'loss': '0.532', 'grad_norm': '0.1943', 'learning_rate': '1.443e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '69.74', 'memory/max_allocated (GiB)': '69.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '50.89', 'tokens/total': 1285093888, 'tokens/trainable': 474671424, 'epoch': '1.159'}
 39%|█████████████████████████████████▋                                                     | 677/1751 [11:20:09<17:55:53, 60.11s/it] 39%|█████████████████████████████████▋                                                     | 678/1751 [11:21:12<18:09:09, 60.90s/it]                                                                                                                                     {'loss': '0.5106', 'grad_norm': '0.1885', 'learning_rate': '1.441e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '74.55', 'memory/max_allocated (GiB)': '74.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.3', 'tokens/total': 1287073408, 'tokens/trainable': 475379104, 'epoch': '1.161'}
 39%|█████████████████████████████████▋                                                     | 678/1751 [11:21:12<18:09:09, 60.90s/it] 39%|█████████████████████████████████▋                                                     | 679/1751 [11:22:14<18:16:33, 61.37s/it]                                                                                                                                     {'loss': '0.5128', 'grad_norm': '0.1797', 'learning_rate': '1.439e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '76.44', 'memory/max_allocated (GiB)': '76.44', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '244', 'tokens/total': 1289037952, 'tokens/trainable': 476106944, 'epoch': '1.163'}
 39%|█████████████████████████████████▋                                                     | 679/1751 [11:22:14<18:16:33, 61.37s/it] 39%|█████████████████████████████████▊                                                     | 680/1751 [11:23:11<17:52:49, 60.10s/it]                                                                                                                                     {'loss': '0.5548', 'grad_norm': '0.1982', 'learning_rate': '1.438e-05', 'ppl': '1.742', 'memory/max_active (GiB)': '71.5', 'memory/max_allocated (GiB)': '71.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.33', 'tokens/total': 1290781440, 'tokens/trainable': 476759168, 'epoch': '1.165'}
 39%|█████████████████████████████████▊                                                     | 680/1751 [11:23:11<17:52:49, 60.10s/it] 39%|█████████████████████████████████▊                                                     | 681/1751 [11:24:14<18:03:26, 60.75s/it]                                                                                                                                     {'loss': '0.5093', 'grad_norm': '0.1992', 'learning_rate': '1.436e-05', 'ppl': '1.664', 'memory/max_active (GiB)': '74.84', 'memory/max_allocated (GiB)': '74.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '165.4', 'tokens/total': 1292785408, 'tokens/trainable': 477518208, 'epoch': '1.166'}
 39%|█████████████████████████████████▊                                                     | 681/1751 [11:24:14<18:03:26, 60.75s/it] 39%|█████████████████████████████████▉                                                     | 682/1751 [11:25:14<18:00:55, 60.67s/it]                                                                                                                                     {'loss': '0.5304', 'grad_norm': '0.2188', 'learning_rate': '1.434e-05', 'ppl': '1.7', 'memory/max_active (GiB)': '71.25', 'memory/max_allocated (GiB)': '71.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.11', 'tokens/total': 1294712832, 'tokens/trainable': 478227904, 'epoch': '1.168'}
 39%|█████████████████████████████████▉                                                     | 682/1751 [11:25:14<18:00:55, 60.67s/it] 39%|█████████████████████████████████▉                                                     | 683/1751 [11:26:13<17:51:20, 60.19s/it]                                                                                                                                     {'loss': '0.5462', 'grad_norm': '0.2061', 'learning_rate': '1.433e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '75.78', 'memory/max_allocated (GiB)': '75.78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '255.6', 'tokens/total': 1296546560, 'tokens/trainable': 478888448, 'epoch': '1.17'}
 39%|█████████████████████████████████▉                                                     | 683/1751 [11:26:13<17:51:20, 60.19s/it] 39%|█████████████████████████████████▉                                                     | 684/1751 [11:27:10<17:33:47, 59.26s/it]                                                                                                                                     {'loss': '0.5611', 'grad_norm': '0.209', 'learning_rate': '1.431e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '73.45', 'memory/max_allocated (GiB)': '73.45', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.6', 'tokens/total': 1298281728, 'tokens/trainable': 479521504, 'epoch': '1.171'}
 39%|█████████████████████████████████▉                                                     | 684/1751 [11:27:10<17:33:47, 59.26s/it] 39%|██████████████████████████████████                                                     | 685/1751 [11:28:13<17:49:24, 60.19s/it]                                                                                                                                     {'loss': '0.5022', 'grad_norm': '0.1875', 'learning_rate': '1.429e-05', 'ppl': '1.652', 'memory/max_active (GiB)': '73.37', 'memory/max_allocated (GiB)': '73.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.98', 'tokens/total': 1300260096, 'tokens/trainable': 480239616, 'epoch': '1.173'}
 39%|██████████████████████████████████                                                     | 685/1751 [11:28:13<17:49:24, 60.19s/it] 39%|██████████████████████████████████                                                     | 686/1751 [11:29:09<17:29:26, 59.12s/it]                                                                                                                                     {'loss': '0.5204', 'grad_norm': '0.1934', 'learning_rate': '1.428e-05', 'ppl': '1.683', 'memory/max_active (GiB)': '71.87', 'memory/max_allocated (GiB)': '71.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '85.14', 'tokens/total': 1302028800, 'tokens/trainable': 480914816, 'epoch': '1.175'}
 39%|██████████████████████████████████                                                     | 686/1751 [11:29:09<17:29:26, 59.12s/it] 39%|██████████████████████████████████▏                                                    | 687/1751 [11:30:11<17:41:38, 59.87s/it]                                                                                                                                     {'loss': '0.5015', 'grad_norm': '0.1992', 'learning_rate': '1.426e-05', 'ppl': '1.651', 'memory/max_active (GiB)': '70.32', 'memory/max_allocated (GiB)': '70.32', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '79.46', 'tokens/total': 1303989504, 'tokens/trainable': 481605632, 'epoch': '1.176'}
 39%|██████████████████████████████████▏                                                    | 687/1751 [11:30:11<17:41:38, 59.87s/it] 39%|██████████████████████████████████▏                                                    | 688/1751 [11:31:10<17:37:46, 59.71s/it]                                                                                                                                     {'loss': '0.5221', 'grad_norm': '0.2041', 'learning_rate': '1.424e-05', 'ppl': '1.686', 'memory/max_active (GiB)': '72.88', 'memory/max_allocated (GiB)': '72.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '122.5', 'tokens/total': 1305860096, 'tokens/trainable': 482303968, 'epoch': '1.178'}
 39%|██████████████████████████████████▏                                                    | 688/1751 [11:31:10<17:37:46, 59.71s/it] 39%|██████████████████████████████████▏                                                    | 689/1751 [11:32:10<17:38:58, 59.83s/it]                                                                                                                                     {'loss': '0.5152', 'grad_norm': '0.1963', 'learning_rate': '1.422e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '73.43', 'memory/max_allocated (GiB)': '73.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.93', 'tokens/total': 1307769728, 'tokens/trainable': 482997504, 'epoch': '1.18'}
 39%|██████████████████████████████████▏                                                    | 689/1751 [11:32:10<17:38:58, 59.83s/it] 39%|██████████████████████████████████▎                                                    | 690/1751 [11:33:09<17:33:37, 59.58s/it]                                                                                                                                     {'loss': '0.5118', 'grad_norm': '0.1895', 'learning_rate': '1.421e-05', 'ppl': '1.668', 'memory/max_active (GiB)': '75.06', 'memory/max_allocated (GiB)': '75.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.16', 'tokens/total': 1309628160, 'tokens/trainable': 483683008, 'epoch': '1.182'}
 39%|██████████████████████████████████▎                                                    | 690/1751 [11:33:09<17:33:37, 59.58s/it] 39%|██████████████████████████████████▎                                                    | 691/1751 [11:34:12<17:46:53, 60.39s/it]                                                                                                                                     {'loss': '0.5025', 'grad_norm': '0.2021', 'learning_rate': '1.419e-05', 'ppl': '1.653', 'memory/max_active (GiB)': '75.08', 'memory/max_allocated (GiB)': '75.08', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '137.2', 'tokens/total': 1311586048, 'tokens/trainable': 484396288, 'epoch': '1.183'}
 39%|██████████████████████████████████▎                                                    | 691/1751 [11:34:12<17:46:53, 60.39s/it] 40%|██████████████████████████████████▍                                                    | 692/1751 [11:35:13<17:51:11, 60.69s/it]                                                                                                                                     {'loss': '0.5172', 'grad_norm': '0.1807', 'learning_rate': '1.417e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '75.13', 'memory/max_allocated (GiB)': '75.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '114.2', 'tokens/total': 1313545856, 'tokens/trainable': 485132384, 'epoch': '1.185'}
 40%|██████████████████████████████████▍                                                    | 692/1751 [11:35:13<17:51:11, 60.69s/it] 40%|██████████████████████████████████▍                                                    | 693/1751 [11:36:12<17:39:08, 60.06s/it]                                                                                                                                     {'loss': '0.5291', 'grad_norm': '0.1982', 'learning_rate': '1.416e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '69.56', 'memory/max_allocated (GiB)': '69.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.2', 'tokens/total': 1315362944, 'tokens/trainable': 485794944, 'epoch': '1.187'}
 40%|██████████████████████████████████▍                                                    | 693/1751 [11:36:12<17:39:08, 60.06s/it] 40%|██████████████████████████████████▍                                                    | 694/1751 [11:37:12<17:42:31, 60.31s/it]                                                                                                                                     {'loss': '0.5265', 'grad_norm': '0.1787', 'learning_rate': '1.414e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '73.69', 'memory/max_allocated (GiB)': '73.69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.64', 'tokens/total': 1317291136, 'tokens/trainable': 486496416, 'epoch': '1.188'}
 40%|██████████████████████████████████▍                                                    | 694/1751 [11:37:12<17:42:31, 60.31s/it] 40%|██████████████████████████████████▌                                                    | 695/1751 [11:38:11<17:32:47, 59.82s/it]                                                                                                                                     {'loss': '0.5475', 'grad_norm': '0.2021', 'learning_rate': '1.412e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '67.47', 'memory/max_allocated (GiB)': '67.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.3', 'tokens/total': 1319131904, 'tokens/trainable': 487152416, 'epoch': '1.19'}
 40%|██████████████████████████████████▌                                                    | 695/1751 [11:38:11<17:32:47, 59.82s/it] 40%|██████████████████████████████████▌                                                    | 696/1751 [11:39:13<17:44:18, 60.53s/it]                                                                                                                                     {'loss': '0.4856', 'grad_norm': '0.1904', 'learning_rate': '1.41e-05', 'ppl': '1.625', 'memory/max_active (GiB)': '70.56', 'memory/max_allocated (GiB)': '70.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '58', 'tokens/total': 1321126144, 'tokens/trainable': 487864352, 'epoch': '1.192'}
 40%|██████████████████████████████████▌                                                    | 696/1751 [11:39:13<17:44:18, 60.53s/it] 40%|██████████████████████████████████▋                                                    | 697/1751 [11:40:14<17:45:52, 60.68s/it]                                                                                                                                     {'loss': '0.5075', 'grad_norm': '0.1963', 'learning_rate': '1.409e-05', 'ppl': '1.661', 'memory/max_active (GiB)': '75.45', 'memory/max_allocated (GiB)': '75.45', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111.4', 'tokens/total': 1323040768, 'tokens/trainable': 488568448, 'epoch': '1.194'}
 40%|██████████████████████████████████▋                                                    | 697/1751 [11:40:14<17:45:52, 60.68s/it] 40%|██████████████████████████████████▋                                                    | 698/1751 [11:41:16<17:52:14, 61.10s/it]                                                                                                                                     {'loss': '0.4938', 'grad_norm': '0.1885', 'learning_rate': '1.407e-05', 'ppl': '1.639', 'memory/max_active (GiB)': '75.53', 'memory/max_allocated (GiB)': '75.53', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '39.94', 'tokens/total': 1325033984, 'tokens/trainable': 489312864, 'epoch': '1.195'}
 40%|██████████████████████████████████▋                                                    | 698/1751 [11:41:16<17:52:14, 61.10s/it] 40%|██████████████████████████████████▋                                                    | 699/1751 [11:42:18<17:55:04, 61.32s/it]                                                                                                                                     {'loss': '0.5181', 'grad_norm': '0.1826', 'learning_rate': '1.405e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '74.51', 'memory/max_allocated (GiB)': '74.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26.22', 'tokens/total': 1326999808, 'tokens/trainable': 490044704, 'epoch': '1.197'}
 40%|██████████████████████████████████▋                                                    | 699/1751 [11:42:18<17:55:04, 61.32s/it] 40%|██████████████████████████████████▊                                                    | 700/1751 [11:43:20<17:55:53, 61.42s/it]                                                                                                                                     {'loss': '0.4923', 'grad_norm': '0.1787', 'learning_rate': '1.404e-05', 'ppl': '1.636', 'memory/max_active (GiB)': '74.33', 'memory/max_allocated (GiB)': '74.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '85.2', 'tokens/total': 1328945024, 'tokens/trainable': 490771552, 'epoch': '1.199'}
 40%|██████████████████████████████████▊                                                    | 700/1751 [11:43:20<17:55:53, 61.42s/it] 40%|██████████████████████████████████▊                                                    | 701/1751 [11:44:20<17:47:47, 61.02s/it]                                                                                                                                     {'loss': '0.5064', 'grad_norm': '0.2031', 'learning_rate': '1.402e-05', 'ppl': '1.659', 'memory/max_active (GiB)': '76.28', 'memory/max_allocated (GiB)': '76.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '72.27', 'tokens/total': 1330837760, 'tokens/trainable': 491451296, 'epoch': '1.2'}
 40%|██████████████████████████████████▊                                                    | 701/1751 [11:44:20<17:47:47, 61.02s/it] 40%|██████████████████████████████████▉                                                    | 702/1751 [11:45:21<17:46:33, 61.00s/it]                                                                                                                                     {'loss': '0.5581', 'grad_norm': '0.209', 'learning_rate': '1.4e-05', 'ppl': '1.747', 'memory/max_active (GiB)': '75.04', 'memory/max_allocated (GiB)': '75.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '142.9', 'tokens/total': 1332747008, 'tokens/trainable': 492161952, 'epoch': '1.202'}
 40%|██████████████████████████████████▉                                                    | 702/1751 [11:45:21<17:46:33, 61.00s/it] 40%|██████████████████████████████████▉                                                    | 703/1751 [11:46:24<17:54:09, 61.50s/it]                                                                                                                                     {'loss': '0.4851', 'grad_norm': '0.1836', 'learning_rate': '1.398e-05', 'ppl': '1.624', 'memory/max_active (GiB)': '76.13', 'memory/max_allocated (GiB)': '76.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.53', 'tokens/total': 1334734976, 'tokens/trainable': 492887296, 'epoch': '1.204'}
 40%|██████████████████████████████████▉                                                    | 703/1751 [11:46:24<17:54:09, 61.50s/it] 40%|██████████████████████████████████▉                                                    | 704/1751 [11:47:20<17:28:43, 60.10s/it]                                                                                                                                     {'loss': '0.5129', 'grad_norm': '0.2041', 'learning_rate': '1.397e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '74.97', 'memory/max_allocated (GiB)': '74.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '150.4', 'tokens/total': 1336524544, 'tokens/trainable': 493561664, 'epoch': '1.206'}
 40%|██████████████████████████████████▉                                                    | 704/1751 [11:47:20<17:28:43, 60.10s/it] 40%|███████████████████████████████████                                                    | 705/1751 [11:48:23<17:38:19, 60.71s/it]                                                                                                                                     {'loss': '0.4883', 'grad_norm': '0.1885', 'learning_rate': '1.395e-05', 'ppl': '1.63', 'memory/max_active (GiB)': '71.43', 'memory/max_allocated (GiB)': '71.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '94.71', 'tokens/total': 1338461056, 'tokens/trainable': 494267872, 'epoch': '1.207'}
 40%|███████████████████████████████████                                                    | 705/1751 [11:48:23<17:38:19, 60.71s/it] 40%|███████████████████████████████████                                                    | 706/1751 [11:49:23<17:36:24, 60.65s/it]                                                                                                                                     {'loss': '0.5013', 'grad_norm': '0.1758', 'learning_rate': '1.393e-05', 'ppl': '1.651', 'memory/max_active (GiB)': '75.79', 'memory/max_allocated (GiB)': '75.79', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '65.81', 'tokens/total': 1340380160, 'tokens/trainable': 494969440, 'epoch': '1.209'}
 40%|███████████████████████████████████                                                    | 706/1751 [11:49:23<17:36:24, 60.65s/it] 40%|███████████████████████████████████▏                                                   | 707/1751 [11:50:21<17:22:58, 59.94s/it]                                                                                                                                     {'loss': '0.5505', 'grad_norm': '0.2012', 'learning_rate': '1.391e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '74.51', 'memory/max_allocated (GiB)': '74.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '72.08', 'tokens/total': 1342231552, 'tokens/trainable': 495640544, 'epoch': '1.211'}
 40%|███████████████████████████████████▏                                                   | 707/1751 [11:50:21<17:22:58, 59.94s/it] 40%|███████████████████████████████████▏                                                   | 708/1751 [11:51:22<17:24:55, 60.11s/it]                                                                                                                                     {'loss': '0.5189', 'grad_norm': '0.1904', 'learning_rate': '1.39e-05', 'ppl': '1.68', 'memory/max_active (GiB)': '76.23', 'memory/max_allocated (GiB)': '76.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.57', 'tokens/total': 1344147328, 'tokens/trainable': 496332928, 'epoch': '1.212'}
 40%|███████████████████████████████████▏                                                   | 708/1751 [11:51:22<17:24:55, 60.11s/it] 40%|███████████████████████████████████▏                                                   | 709/1751 [11:52:22<17:23:58, 60.11s/it]                                                                                                                                     {'loss': '0.5199', 'grad_norm': '0.1885', 'learning_rate': '1.388e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '76.82', 'memory/max_allocated (GiB)': '76.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '114.9', 'tokens/total': 1346024192, 'tokens/trainable': 497027744, 'epoch': '1.214'}
 40%|███████████████████████████████████▏                                                   | 709/1751 [11:52:22<17:23:58, 60.11s/it] 41%|███████████████████████████████████▎                                                   | 710/1751 [11:53:23<17:25:42, 60.27s/it]                                                                                                                                     {'loss': '0.5389', 'grad_norm': '0.2012', 'learning_rate': '1.386e-05', 'ppl': '1.714', 'memory/max_active (GiB)': '76.9', 'memory/max_allocated (GiB)': '76.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.58', 'tokens/total': 1347915008, 'tokens/trainable': 497745120, 'epoch': '1.216'}
 41%|███████████████████████████████████▎                                                   | 710/1751 [11:53:23<17:25:42, 60.27s/it] 41%|███████████████████████████████████▎                                                   | 711/1751 [11:54:23<17:24:36, 60.27s/it]                                                                                                                                     {'loss': '0.5366', 'grad_norm': '0.1973', 'learning_rate': '1.384e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '71.59', 'memory/max_allocated (GiB)': '71.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '31.71', 'tokens/total': 1349793792, 'tokens/trainable': 498457120, 'epoch': '1.218'}
 41%|███████████████████████████████████▎                                                   | 711/1751 [11:54:23<17:24:36, 60.27s/it] 41%|███████████████████████████████████▍                                                   | 712/1751 [11:55:21<17:13:50, 59.70s/it]                                                                                                                                     {'loss': '0.5611', 'grad_norm': '0.1992', 'learning_rate': '1.383e-05', 'ppl': '1.753', 'memory/max_active (GiB)': '73.18', 'memory/max_allocated (GiB)': '73.18', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '129.5', 'tokens/total': 1351617408, 'tokens/trainable': 499097536, 'epoch': '1.219'}
 41%|███████████████████████████████████▍                                                   | 712/1751 [11:55:21<17:13:50, 59.70s/it] 41%|███████████████████████████████████▍                                                   | 713/1751 [11:56:24<17:29:54, 60.69s/it]                                                                                                                                     {'loss': '0.5377', 'grad_norm': '0.2012', 'learning_rate': '1.381e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '76.74', 'memory/max_allocated (GiB)': '76.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '222.6', 'tokens/total': 1353644928, 'tokens/trainable': 499859232, 'epoch': '1.221'}
 41%|███████████████████████████████████▍                                                   | 713/1751 [11:56:24<17:29:54, 60.69s/it] 41%|███████████████████████████████████▍                                                   | 714/1751 [11:57:26<17:32:41, 60.91s/it]                                                                                                                                     {'loss': '0.5092', 'grad_norm': '0.1846', 'learning_rate': '1.379e-05', 'ppl': '1.664', 'memory/max_active (GiB)': '76.65', 'memory/max_allocated (GiB)': '76.65', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.33', 'tokens/total': 1355623424, 'tokens/trainable': 500589888, 'epoch': '1.223'}
 41%|███████████████████████████████████▍                                                   | 714/1751 [11:57:26<17:32:41, 60.91s/it] 41%|███████████████████████████████████▌                                                   | 715/1751 [11:58:27<17:36:10, 61.17s/it]                                                                                                                                     {'loss': '0.5126', 'grad_norm': '0.1826', 'learning_rate': '1.377e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '74.72', 'memory/max_allocated (GiB)': '74.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '145.7', 'tokens/total': 1357603968, 'tokens/trainable': 501299456, 'epoch': '1.224'}
 41%|███████████████████████████████████▌                                                   | 715/1751 [11:58:27<17:36:10, 61.17s/it] 41%|███████████████████████████████████▌                                                   | 716/1751 [11:59:28<17:32:06, 60.99s/it]                                                                                                                                     {'loss': '0.5063', 'grad_norm': '0.2021', 'learning_rate': '1.376e-05', 'ppl': '1.659', 'memory/max_active (GiB)': '72.41', 'memory/max_allocated (GiB)': '72.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '72.01', 'tokens/total': 1359528704, 'tokens/trainable': 501984288, 'epoch': '1.226'}
 41%|███████████████████████████████████▌                                                   | 716/1751 [11:59:28<17:32:06, 60.99s/it] 41%|███████████████████████████████████▌                                                   | 717/1751 [12:00:28<17:26:27, 60.72s/it]                                                                                                                                     {'loss': '0.5528', 'grad_norm': '0.1924', 'learning_rate': '1.374e-05', 'ppl': '1.738', 'memory/max_active (GiB)': '71.91', 'memory/max_allocated (GiB)': '71.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '83.74', 'tokens/total': 1361401600, 'tokens/trainable': 502738144, 'epoch': '1.228'}
 41%|███████████████████████████████████▌                                                   | 717/1751 [12:00:28<17:26:27, 60.72s/it] 41%|███████████████████████████████████▋                                                   | 718/1751 [12:01:28<17:22:03, 60.53s/it]                                                                                                                                     {'loss': '0.5607', 'grad_norm': '0.1943', 'learning_rate': '1.372e-05', 'ppl': '1.752', 'memory/max_active (GiB)': '70.76', 'memory/max_allocated (GiB)': '70.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '192.8', 'tokens/total': 1363234304, 'tokens/trainable': 503456288, 'epoch': '1.23'}
 41%|███████████████████████████████████▋                                                   | 718/1751 [12:01:28<17:22:03, 60.53s/it] 41%|███████████████████████████████████▋                                                   | 719/1751 [12:02:30<17:25:55, 60.81s/it]                                                                                                                                     {'loss': '0.5244', 'grad_norm': '0.208', 'learning_rate': '1.37e-05', 'ppl': '1.69', 'memory/max_active (GiB)': '74.33', 'memory/max_allocated (GiB)': '74.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '112.8', 'tokens/total': 1365203072, 'tokens/trainable': 504153344, 'epoch': '1.231'}
 41%|███████████████████████████████████▋                                                   | 719/1751 [12:02:30<17:25:55, 60.81s/it] 41%|███████████████████████████████████▊                                                   | 720/1751 [12:03:32<17:30:21, 61.13s/it]                                                                                                                                     {'loss': '0.5119', 'grad_norm': '0.2012', 'learning_rate': '1.369e-05', 'ppl': '1.668', 'memory/max_active (GiB)': '72.8', 'memory/max_allocated (GiB)': '72.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.45', 'tokens/total': 1367177344, 'tokens/trainable': 504865728, 'epoch': '1.233'}
 41%|███████████████████████████████████▊                                                   | 720/1751 [12:03:32<17:30:21, 61.13s/it] 41%|███████████████████████████████████▊                                                   | 721/1751 [12:04:33<17:31:58, 61.28s/it]                                                                                                                                     {'loss': '0.5254', 'grad_norm': '0.1934', 'learning_rate': '1.367e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '75.93', 'memory/max_allocated (GiB)': '75.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '62.01', 'tokens/total': 1369123328, 'tokens/trainable': 505611904, 'epoch': '1.235'}
 41%|███████████████████████████████████▊                                                   | 721/1751 [12:04:33<17:31:58, 61.28s/it] 41%|███████████████████████████████████▊                                                   | 722/1751 [12:05:33<17:21:33, 60.73s/it]                                                                                                                                     {'loss': '0.5512', 'grad_norm': '0.1973', 'learning_rate': '1.365e-05', 'ppl': '1.735', 'memory/max_active (GiB)': '72.7', 'memory/max_allocated (GiB)': '72.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.78', 'tokens/total': 1370942848, 'tokens/trainable': 506287392, 'epoch': '1.236'}
 41%|███████████████████████████████████▊                                                   | 722/1751 [12:05:33<17:21:33, 60.73s/it] 41%|███████████████████████████████████▉                                                   | 723/1751 [12:06:35<17:27:20, 61.13s/it]                                                                                                                                     {'loss': '0.5147', 'grad_norm': '0.1816', 'learning_rate': '1.363e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '75.29', 'memory/max_allocated (GiB)': '75.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.93', 'tokens/total': 1372934912, 'tokens/trainable': 507000416, 'epoch': '1.238'}
 41%|███████████████████████████████████▉                                                   | 723/1751 [12:06:35<17:27:20, 61.13s/it] 41%|███████████████████████████████████▉                                                   | 724/1751 [12:07:37<17:31:21, 61.42s/it]                                                                                                                                     {'loss': '0.4736', 'grad_norm': '0.1807', 'learning_rate': '1.362e-05', 'ppl': '1.606', 'memory/max_active (GiB)': '75.15', 'memory/max_allocated (GiB)': '75.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '67.02', 'tokens/total': 1374894208, 'tokens/trainable': 507707168, 'epoch': '1.24'}
 41%|███████████████████████████████████▉                                                   | 724/1751 [12:07:37<17:31:21, 61.42s/it] 41%|████████████████████████████████████                                                   | 725/1751 [12:08:35<17:16:19, 60.60s/it]                                                                                                                                     {'loss': '0.5122', 'grad_norm': '0.2021', 'learning_rate': '1.36e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '73.33', 'memory/max_allocated (GiB)': '73.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '97.92', 'tokens/total': 1376696064, 'tokens/trainable': 508357152, 'epoch': '1.242'}
 41%|████████████████████████████████████                                                   | 725/1751 [12:08:35<17:16:19, 60.60s/it] 41%|████████████████████████████████████                                                   | 726/1751 [12:09:35<17:11:45, 60.40s/it]                                                                                                                                     {'loss': '0.5203', 'grad_norm': '0.1885', 'learning_rate': '1.358e-05', 'ppl': '1.683', 'memory/max_active (GiB)': '76.12', 'memory/max_allocated (GiB)': '76.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '145.9', 'tokens/total': 1378567680, 'tokens/trainable': 509057216, 'epoch': '1.243'}
 41%|████████████████████████████████████                                                   | 726/1751 [12:09:35<17:11:45, 60.40s/it] 42%|████████████████████████████████████                                                   | 727/1751 [12:10:38<17:19:31, 60.91s/it]                                                                                                                                     {'loss': '0.5233', 'grad_norm': '0.1797', 'learning_rate': '1.356e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '74.62', 'memory/max_allocated (GiB)': '74.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '304.6', 'tokens/total': 1380503936, 'tokens/trainable': 509782272, 'epoch': '1.245'}
 42%|████████████████████████████████████                                                   | 727/1751 [12:10:38<17:19:31, 60.91s/it] 42%|████████████████████████████████████▏                                                  | 728/1751 [12:11:38<17:17:43, 60.86s/it]                                                                                                                                     {'loss': '0.5235', 'grad_norm': '0.1904', 'learning_rate': '1.355e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '75.5', 'memory/max_allocated (GiB)': '75.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '29.3', 'tokens/total': 1382371840, 'tokens/trainable': 510477568, 'epoch': '1.247'}
 42%|████████████████████████████████████▏                                                  | 728/1751 [12:11:38<17:17:43, 60.86s/it] 42%|████████████████████████████████████▏                                                  | 729/1751 [12:12:39<17:16:51, 60.87s/it]                                                                                                                                     {'loss': '0.5169', 'grad_norm': '0.1855', 'learning_rate': '1.353e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '76.63', 'memory/max_allocated (GiB)': '76.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '147.7', 'tokens/total': 1384301568, 'tokens/trainable': 511181952, 'epoch': '1.248'}
 42%|████████████████████████████████████▏                                                  | 729/1751 [12:12:39<17:16:51, 60.87s/it] 42%|████████████████████████████████████▎                                                  | 730/1751 [12:13:40<17:17:59, 61.00s/it]                                                                                                                                     {'loss': '0.4927', 'grad_norm': '0.1709', 'learning_rate': '1.351e-05', 'ppl': '1.637', 'memory/max_active (GiB)': '72.89', 'memory/max_allocated (GiB)': '72.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '32.15', 'tokens/total': 1386202496, 'tokens/trainable': 511907008, 'epoch': '1.25'}
 42%|████████████████████████████████████▎                                                  | 730/1751 [12:13:40<17:17:59, 61.00s/it] 42%|████████████████████████████████████▎                                                  | 731/1751 [12:14:41<17:13:00, 60.76s/it]                                                                                                                                     {'loss': '0.5153', 'grad_norm': '0.1826', 'learning_rate': '1.349e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '72.42', 'memory/max_allocated (GiB)': '72.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '112', 'tokens/total': 1388128384, 'tokens/trainable': 512622464, 'epoch': '1.252'}
 42%|████████████████████████████████████▎                                                  | 731/1751 [12:14:41<17:13:00, 60.76s/it] 42%|████████████████████████████████████▎                                                  | 732/1751 [12:15:41<17:10:09, 60.66s/it]                                                                                                                                     {'loss': '0.5148', 'grad_norm': '0.1953', 'learning_rate': '1.348e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '75.73', 'memory/max_allocated (GiB)': '75.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '120.4', 'tokens/total': 1390076032, 'tokens/trainable': 513334784, 'epoch': '1.254'}
 42%|████████████████████████████████████▎                                                  | 732/1751 [12:15:41<17:10:09, 60.66s/it] 42%|████████████████████████████████████▍                                                  | 733/1751 [12:16:41<17:04:47, 60.40s/it]                                                                                                                                     {'loss': '0.5056', 'grad_norm': '0.1855', 'learning_rate': '1.346e-05', 'ppl': '1.658', 'memory/max_active (GiB)': '74.12', 'memory/max_allocated (GiB)': '74.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '139.9', 'tokens/total': 1392011264, 'tokens/trainable': 514027968, 'epoch': '1.255'}
 42%|████████████████████████████████████▍                                                  | 733/1751 [12:16:41<17:04:47, 60.40s/it] 42%|████████████████████████████████████▍                                                  | 734/1751 [12:17:42<17:09:23, 60.73s/it]                                                                                                                                     {'loss': '0.5656', 'grad_norm': '0.1992', 'learning_rate': '1.344e-05', 'ppl': '1.76', 'memory/max_active (GiB)': '74.81', 'memory/max_allocated (GiB)': '74.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.46', 'tokens/total': 1394015616, 'tokens/trainable': 514754272, 'epoch': '1.257'}
 42%|████████████████████████████████████▍                                                  | 734/1751 [12:17:42<17:09:23, 60.73s/it] 42%|████████████████████████████████████▌                                                  | 735/1751 [12:18:42<17:04:05, 60.48s/it]                                                                                                                                     {'loss': '0.513', 'grad_norm': '0.1973', 'learning_rate': '1.342e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '74.39', 'memory/max_allocated (GiB)': '74.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '85.29', 'tokens/total': 1395954816, 'tokens/trainable': 515460064, 'epoch': '1.259'}
 42%|████████████████████████████████████▌                                                  | 735/1751 [12:18:42<17:04:05, 60.48s/it] 42%|████████████████████████████████████▌                                                  | 736/1751 [12:19:42<16:59:39, 60.28s/it]                                                                                                                                     {'loss': '0.5181', 'grad_norm': '0.1787', 'learning_rate': '1.34e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '77.47', 'memory/max_allocated (GiB)': '77.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '87.58', 'tokens/total': 1397853056, 'tokens/trainable': 516159264, 'epoch': '1.26'}
 42%|████████████████████████████████████▌                                                  | 736/1751 [12:19:42<16:59:39, 60.28s/it] 42%|████████████████████████████████████▌                                                  | 737/1751 [12:20:42<16:57:41, 60.22s/it]                                                                                                                                     {'loss': '0.5302', 'grad_norm': '0.208', 'learning_rate': '1.339e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '76.43', 'memory/max_allocated (GiB)': '76.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.7', 'tokens/total': 1399770112, 'tokens/trainable': 516879296, 'epoch': '1.262'}
 42%|████████████████████████████████████▌                                                  | 737/1751 [12:20:42<16:57:41, 60.22s/it] 42%|████████████████████████████████████▋                                                  | 738/1751 [12:21:42<16:56:11, 60.19s/it]                                                                                                                                     {'loss': '0.4965', 'grad_norm': '0.1816', 'learning_rate': '1.337e-05', 'ppl': '1.643', 'memory/max_active (GiB)': '71.08', 'memory/max_allocated (GiB)': '71.08', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '174', 'tokens/total': 1401699072, 'tokens/trainable': 517572640, 'epoch': '1.264'}
 42%|████████████████████████████████████▋                                                  | 738/1751 [12:21:42<16:56:11, 60.19s/it] 42%|████████████████████████████████████▋                                                  | 739/1751 [12:22:44<17:05:25, 60.80s/it]                                                                                                                                     {'loss': '0.479', 'grad_norm': '0.1777', 'learning_rate': '1.335e-05', 'ppl': '1.614', 'memory/max_active (GiB)': '73.7', 'memory/max_allocated (GiB)': '73.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.29', 'tokens/total': 1403688960, 'tokens/trainable': 518346400, 'epoch': '1.266'}
 42%|████████████████████████████████████▋                                                  | 739/1751 [12:22:44<17:05:25, 60.80s/it] 42%|████████████████████████████████████▊                                                  | 740/1751 [12:23:44<16:58:28, 60.44s/it]                                                                                                                                     {'loss': '0.5132', 'grad_norm': '0.2002', 'learning_rate': '1.333e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '69.69', 'memory/max_allocated (GiB)': '69.69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '44.9', 'tokens/total': 1405581440, 'tokens/trainable': 519041824, 'epoch': '1.267'}
 42%|████████████████████████████████████▊                                                  | 740/1751 [12:23:44<16:58:28, 60.44s/it] 42%|████████████████████████████████████▊                                                  | 741/1751 [12:24:44<16:54:30, 60.27s/it]                                                                                                                                     {'loss': '0.5312', 'grad_norm': '0.1855', 'learning_rate': '1.332e-05', 'ppl': '1.701', 'memory/max_active (GiB)': '73.11', 'memory/max_allocated (GiB)': '73.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '33.02', 'tokens/total': 1407526144, 'tokens/trainable': 519768608, 'epoch': '1.269'}
 42%|████████████████████████████████████▊                                                  | 741/1751 [12:24:44<16:54:30, 60.27s/it] 42%|████████████████████████████████████▊                                                  | 742/1751 [12:25:43<16:47:57, 59.94s/it]                                                                                                                                     {'loss': '0.5252', 'grad_norm': '0.1875', 'learning_rate': '1.33e-05', 'ppl': '1.691', 'memory/max_active (GiB)': '74.52', 'memory/max_allocated (GiB)': '74.52', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '61.99', 'tokens/total': 1409358208, 'tokens/trainable': 520427680, 'epoch': '1.271'}
 42%|████████████████████████████████████▊                                                  | 742/1751 [12:25:43<16:47:57, 59.94s/it] 42%|████████████████████████████████████▉                                                  | 743/1751 [12:26:46<16:59:30, 60.68s/it]                                                                                                                                     {'loss': '0.53', 'grad_norm': '0.1973', 'learning_rate': '1.328e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '74.66', 'memory/max_allocated (GiB)': '74.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '145.5', 'tokens/total': 1411324928, 'tokens/trainable': 521132640, 'epoch': '1.272'}
 42%|████████████████████████████████████▉                                                  | 743/1751 [12:26:46<16:59:30, 60.68s/it] 42%|████████████████████████████████████▉                                                  | 744/1751 [12:27:47<17:00:58, 60.83s/it]                                                                                                                                     {'loss': '0.5266', 'grad_norm': '0.1895', 'learning_rate': '1.326e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '70.07', 'memory/max_allocated (GiB)': '70.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.08', 'tokens/total': 1413255168, 'tokens/trainable': 521849536, 'epoch': '1.274'}
 42%|████████████████████████████████████▉                                                  | 744/1751 [12:27:47<17:00:58, 60.83s/it] 43%|█████████████████████████████████████                                                  | 745/1751 [12:28:47<16:57:25, 60.68s/it]                                                                                                                                     {'loss': '0.5108', 'grad_norm': '0.1982', 'learning_rate': '1.324e-05', 'ppl': '1.667', 'memory/max_active (GiB)': '77.38', 'memory/max_allocated (GiB)': '77.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.18', 'tokens/total': 1415181952, 'tokens/trainable': 522536544, 'epoch': '1.276'}
 43%|█████████████████████████████████████                                                  | 745/1751 [12:28:47<16:57:25, 60.68s/it] 43%|█████████████████████████████████████                                                  | 746/1751 [12:29:48<16:59:26, 60.86s/it]                                                                                                                                     {'loss': '0.5132', 'grad_norm': '0.1826', 'learning_rate': '1.323e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '74.75', 'memory/max_allocated (GiB)': '74.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '34.13', 'tokens/total': 1417083008, 'tokens/trainable': 523260608, 'epoch': '1.278'}
 43%|█████████████████████████████████████                                                  | 746/1751 [12:29:48<16:59:26, 60.86s/it] 43%|█████████████████████████████████████                                                  | 747/1751 [12:30:48<16:51:44, 60.46s/it]                                                                                                                                     {'loss': '0.5603', 'grad_norm': '0.1943', 'learning_rate': '1.321e-05', 'ppl': '1.751', 'memory/max_active (GiB)': '76.13', 'memory/max_allocated (GiB)': '76.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '123.1', 'tokens/total': 1418941312, 'tokens/trainable': 523970592, 'epoch': '1.279'}
 43%|█████████████████████████████████████                                                  | 747/1751 [12:30:48<16:51:44, 60.46s/it] 43%|█████████████████████████████████████▏                                                 | 748/1751 [12:31:49<16:52:31, 60.57s/it]                                                                                                                                     {'loss': '0.5161', 'grad_norm': '0.2061', 'learning_rate': '1.319e-05', 'ppl': '1.676', 'memory/max_active (GiB)': '75.62', 'memory/max_allocated (GiB)': '75.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '35.28', 'tokens/total': 1420904192, 'tokens/trainable': 524675296, 'epoch': '1.281'}
 43%|█████████████████████████████████████▏                                                 | 748/1751 [12:31:49<16:52:31, 60.57s/it] 43%|█████████████████████████████████████▏                                                 | 749/1751 [12:32:49<16:47:56, 60.36s/it]                                                                                                                                     {'loss': '0.5541', 'grad_norm': '0.1992', 'learning_rate': '1.317e-05', 'ppl': '1.74', 'memory/max_active (GiB)': '70.81', 'memory/max_allocated (GiB)': '70.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '53.26', 'tokens/total': 1422841216, 'tokens/trainable': 525362112, 'epoch': '1.283'}
 43%|█████████████████████████████████████▏                                                 | 749/1751 [12:32:49<16:47:56, 60.36s/it] 43%|█████████████████████████████████████▎                                                 | 750/1751 [12:33:48<16:41:33, 60.03s/it]                                                                                                                                     {'loss': '0.5187', 'grad_norm': '0.1885', 'learning_rate': '1.315e-05', 'ppl': '1.68', 'memory/max_active (GiB)': '75.87', 'memory/max_allocated (GiB)': '75.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.19', 'tokens/total': 1424709504, 'tokens/trainable': 526041472, 'epoch': '1.284'}
 43%|█████████████████████████████████████▎                                                 | 750/1751 [12:33:48<16:41:33, 60.03s/it] 43%|█████████████████████████████████████▎                                                 | 751/1751 [12:34:45<16:24:21, 59.06s/it]                                                                                                                                     {'loss': '0.5449', 'grad_norm': '0.1895', 'learning_rate': '1.314e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '73.79', 'memory/max_allocated (GiB)': '73.79', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26.39', 'tokens/total': 1426491392, 'tokens/trainable': 526710368, 'epoch': '1.286'}
 43%|█████████████████████████████████████▎                                                 | 751/1751 [12:34:45<16:24:21, 59.06s/it] 43%|█████████████████████████████████████▎                                                 | 752/1751 [12:35:46<16:36:43, 59.86s/it]                                                                                                                                     {'loss': '0.4904', 'grad_norm': '0.1738', 'learning_rate': '1.312e-05', 'ppl': '1.633', 'memory/max_active (GiB)': '76.21', 'memory/max_allocated (GiB)': '76.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '38.94', 'tokens/total': 1428469248, 'tokens/trainable': 527440544, 'epoch': '1.288'}
 43%|█████████████████████████████████████▎                                                 | 752/1751 [12:35:46<16:36:43, 59.86s/it] 43%|█████████████████████████████████████▍                                                 | 753/1751 [12:36:45<16:27:22, 59.36s/it]                                                                                                                                     {'loss': '0.5439', 'grad_norm': '0.1963', 'learning_rate': '1.31e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '71.08', 'memory/max_allocated (GiB)': '71.08', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '152.7', 'tokens/total': 1430285440, 'tokens/trainable': 528111456, 'epoch': '1.29'}
 43%|█████████████████████████████████████▍                                                 | 753/1751 [12:36:45<16:27:22, 59.36s/it] 43%|█████████████████████████████████████▍                                                 | 754/1751 [12:37:47<16:43:09, 60.37s/it]                                                                                                                                     {'loss': '0.4892', 'grad_norm': '0.1758', 'learning_rate': '1.308e-05', 'ppl': '1.631', 'memory/max_active (GiB)': '72.09', 'memory/max_allocated (GiB)': '72.09', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '45.54', 'tokens/total': 1432288384, 'tokens/trainable': 528869184, 'epoch': '1.291'}
 43%|█████████████████████████████████████▍                                                 | 754/1751 [12:37:47<16:43:09, 60.37s/it] 43%|█████████████████████████████████████▌                                                 | 755/1751 [12:38:48<16:42:15, 60.38s/it]                                                                                                                                     {'loss': '0.5321', 'grad_norm': '0.1963', 'learning_rate': '1.307e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '73.97', 'memory/max_allocated (GiB)': '73.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '131.5', 'tokens/total': 1434231552, 'tokens/trainable': 529573696, 'epoch': '1.293'}
 43%|█████████████████████████████████████▌                                                 | 755/1751 [12:38:48<16:42:15, 60.38s/it] 43%|█████████████████████████████████████▌                                                 | 756/1751 [12:39:47<16:35:00, 60.00s/it]                                                                                                                                     {'loss': '0.5145', 'grad_norm': '0.1895', 'learning_rate': '1.305e-05', 'ppl': '1.673', 'memory/max_active (GiB)': '69.43', 'memory/max_allocated (GiB)': '69.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '93.52', 'tokens/total': 1436101760, 'tokens/trainable': 530309888, 'epoch': '1.295'}
 43%|█████████████████████████████████████▌                                                 | 756/1751 [12:39:47<16:35:00, 60.00s/it] 43%|█████████████████████████████████████▌                                                 | 757/1751 [12:40:46<16:28:58, 59.70s/it]                                                                                                                                     {'loss': '0.5343', 'grad_norm': '0.2061', 'learning_rate': '1.303e-05', 'ppl': '1.706', 'memory/max_active (GiB)': '75.05', 'memory/max_allocated (GiB)': '75.05', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '206.1', 'tokens/total': 1437988224, 'tokens/trainable': 530983584, 'epoch': '1.296'}
 43%|█████████████████████████████████████▌                                                 | 757/1751 [12:40:46<16:28:58, 59.70s/it] 43%|█████████████████████████████████████▋                                                 | 758/1751 [12:41:46<16:31:09, 59.89s/it]                                                                                                                                     {'loss': '0.5383', 'grad_norm': '0.1875', 'learning_rate': '1.301e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '73.58', 'memory/max_allocated (GiB)': '73.58', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '45.42', 'tokens/total': 1439917440, 'tokens/trainable': 531708896, 'epoch': '1.298'}
 43%|█████████████████████████████████████▋                                                 | 758/1751 [12:41:46<16:31:09, 59.89s/it] 43%|█████████████████████████████████████▋                                                 | 759/1751 [12:42:46<16:28:41, 59.80s/it]                                                                                                                                     {'loss': '0.5338', 'grad_norm': '0.1943', 'learning_rate': '1.299e-05', 'ppl': '1.705', 'memory/max_active (GiB)': '69.94', 'memory/max_allocated (GiB)': '69.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '163.8', 'tokens/total': 1441747840, 'tokens/trainable': 532393120, 'epoch': '1.3'}
 43%|█████████████████████████████████████▋                                                 | 759/1751 [12:42:46<16:28:41, 59.80s/it] 43%|█████████████████████████████████████▊                                                 | 760/1751 [12:43:47<16:33:09, 60.13s/it]                                                                                                                                     {'loss': '0.5434', 'grad_norm': '0.1924', 'learning_rate': '1.298e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '74.36', 'memory/max_allocated (GiB)': '74.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '51.69', 'tokens/total': 1443621120, 'tokens/trainable': 533060384, 'epoch': '1.302'}
 43%|█████████████████████████████████████▊                                                 | 760/1751 [12:43:47<16:33:09, 60.13s/it] 43%|█████████████████████████████████████▊                                                 | 761/1751 [12:44:48<16:39:34, 60.58s/it]                                                                                                                                     {'loss': '0.5135', 'grad_norm': '0.1875', 'learning_rate': '1.296e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '76.16', 'memory/max_allocated (GiB)': '76.16', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '143.4', 'tokens/total': 1445576832, 'tokens/trainable': 533800544, 'epoch': '1.303'}
 43%|█████████████████████████████████████▊                                                 | 761/1751 [12:44:48<16:39:34, 60.58s/it] 44%|█████████████████████████████████████▊                                                 | 762/1751 [12:45:50<16:43:25, 60.88s/it]                                                                                                                                     {'loss': '0.5009', 'grad_norm': '0.1797', 'learning_rate': '1.294e-05', 'ppl': '1.65', 'memory/max_active (GiB)': '74.83', 'memory/max_allocated (GiB)': '74.83', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.59', 'tokens/total': 1447528448, 'tokens/trainable': 534546592, 'epoch': '1.305'}
 44%|█████████████████████████████████████▊                                                 | 762/1751 [12:45:50<16:43:25, 60.88s/it] 44%|█████████████████████████████████████▉                                                 | 763/1751 [12:46:51<16:44:03, 60.97s/it]                                                                                                                                     {'loss': '0.4943', 'grad_norm': '0.1768', 'learning_rate': '1.292e-05', 'ppl': '1.639', 'memory/max_active (GiB)': '77.91', 'memory/max_allocated (GiB)': '77.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.06', 'tokens/total': 1449460864, 'tokens/trainable': 535267296, 'epoch': '1.307'}
 44%|█████████████████████████████████████▉                                                 | 763/1751 [12:46:51<16:44:03, 60.97s/it] 44%|█████████████████████████████████████▉                                                 | 764/1751 [12:47:52<16:41:26, 60.88s/it]                                                                                                                                     {'loss': '0.4871', 'grad_norm': '0.1953', 'learning_rate': '1.29e-05', 'ppl': '1.628', 'memory/max_active (GiB)': '72.91', 'memory/max_allocated (GiB)': '72.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '100.3', 'tokens/total': 1451361408, 'tokens/trainable': 535950336, 'epoch': '1.308'}
 44%|█████████████████████████████████████▉                                                 | 764/1751 [12:47:52<16:41:26, 60.88s/it] 44%|██████████████████████████████████████                                                 | 765/1751 [12:48:52<16:39:50, 60.84s/it]                                                                                                                                     {'loss': '0.5333', 'grad_norm': '0.2031', 'learning_rate': '1.288e-05', 'ppl': '1.705', 'memory/max_active (GiB)': '75.88', 'memory/max_allocated (GiB)': '75.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '96.94', 'tokens/total': 1453258496, 'tokens/trainable': 536654688, 'epoch': '1.31'}
 44%|██████████████████████████████████████                                                 | 765/1751 [12:48:52<16:39:50, 60.84s/it] 44%|██████████████████████████████████████                                                 | 766/1751 [12:49:54<16:43:18, 61.12s/it]                                                                                                                                     {'loss': '0.5119', 'grad_norm': '0.1865', 'learning_rate': '1.287e-05', 'ppl': '1.668', 'memory/max_active (GiB)': '77.16', 'memory/max_allocated (GiB)': '77.16', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.36', 'tokens/total': 1455196160, 'tokens/trainable': 537375616, 'epoch': '1.312'}
 44%|██████████████████████████████████████                                                 | 766/1751 [12:49:54<16:43:18, 61.12s/it] 44%|██████████████████████████████████████                                                 | 767/1751 [12:50:55<16:39:07, 60.92s/it]                                                                                                                                     {'loss': '0.51', 'grad_norm': '0.1924', 'learning_rate': '1.285e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '73.98', 'memory/max_allocated (GiB)': '73.98', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '74.82', 'tokens/total': 1457095680, 'tokens/trainable': 538050880, 'epoch': '1.314'}
 44%|██████████████████████████████████████                                                 | 767/1751 [12:50:55<16:39:07, 60.92s/it] 44%|██████████████████████████████████████▏                                                | 768/1751 [12:51:52<16:19:00, 59.76s/it]                                                                                                                                     {'loss': '0.5088', 'grad_norm': '0.1904', 'learning_rate': '1.283e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '74.77', 'memory/max_allocated (GiB)': '74.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.35', 'tokens/total': 1458886272, 'tokens/trainable': 538687104, 'epoch': '1.315'}
 44%|██████████████████████████████████████▏                                                | 768/1751 [12:51:52<16:19:00, 59.76s/it] 44%|██████████████████████████████████████▏                                                | 769/1751 [12:52:52<16:19:04, 59.82s/it]                                                                                                                                     {'loss': '0.5256', 'grad_norm': '0.1895', 'learning_rate': '1.281e-05', 'ppl': '1.692', 'memory/max_active (GiB)': '72.24', 'memory/max_allocated (GiB)': '72.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '49', 'tokens/total': 1460794240, 'tokens/trainable': 539412992, 'epoch': '1.317'}
 44%|██████████████████████████████████████▏                                                | 769/1751 [12:52:52<16:19:04, 59.82s/it] 44%|██████████████████████████████████████▎                                                | 770/1751 [12:53:52<16:20:46, 59.99s/it]                                                                                                                                     {'loss': '0.5015', 'grad_norm': '0.1826', 'learning_rate': '1.279e-05', 'ppl': '1.651', 'memory/max_active (GiB)': '73.45', 'memory/max_allocated (GiB)': '73.45', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.75', 'tokens/total': 1462699136, 'tokens/trainable': 540159296, 'epoch': '1.319'}
 44%|██████████████████████████████████████▎                                                | 770/1751 [12:53:52<16:20:46, 59.99s/it] 44%|██████████████████████████████████████▎                                                | 771/1751 [12:54:52<16:17:58, 59.88s/it]                                                                                                                                     {'loss': '0.5297', 'grad_norm': '0.1816', 'learning_rate': '1.278e-05', 'ppl': '1.698', 'memory/max_active (GiB)': '71.89', 'memory/max_allocated (GiB)': '71.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.2', 'tokens/total': 1464565760, 'tokens/trainable': 540863232, 'epoch': '1.32'}
 44%|██████████████████████████████████████▎                                                | 771/1751 [12:54:52<16:17:58, 59.88s/it] 44%|██████████████████████████████████████▎                                                | 772/1751 [12:55:53<16:23:10, 60.26s/it]                                                                                                                                     {'loss': '0.5316', 'grad_norm': '0.1836', 'learning_rate': '1.276e-05', 'ppl': '1.702', 'memory/max_active (GiB)': '77.19', 'memory/max_allocated (GiB)': '77.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '93.09', 'tokens/total': 1466492544, 'tokens/trainable': 541605760, 'epoch': '1.322'}
 44%|██████████████████████████████████████▎                                                | 772/1751 [12:55:53<16:23:10, 60.26s/it] 44%|██████████████████████████████████████▍                                                | 773/1751 [12:56:58<16:44:25, 61.62s/it]                                                                                                                                     {'loss': '0.5029', 'grad_norm': '0.1748', 'learning_rate': '1.274e-05', 'ppl': '1.653', 'memory/max_active (GiB)': '75.82', 'memory/max_allocated (GiB)': '75.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '120.4', 'tokens/total': 1468604800, 'tokens/trainable': 542396672, 'epoch': '1.324'}
 44%|██████████████████████████████████████▍                                                | 773/1751 [12:56:58<16:44:25, 61.62s/it] 44%|██████████████████████████████████████▍                                                | 774/1751 [12:57:57<16:31:03, 60.86s/it]                                                                                                                                     {'loss': '0.4966', 'grad_norm': '0.1943', 'learning_rate': '1.272e-05', 'ppl': '1.643', 'memory/max_active (GiB)': '73.38', 'memory/max_allocated (GiB)': '73.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '15.65', 'tokens/total': 1470457600, 'tokens/trainable': 543089920, 'epoch': '1.326'}
 44%|██████████████████████████████████████▍                                                | 774/1751 [12:57:57<16:31:03, 60.86s/it] 44%|██████████████████████████████████████▌                                                | 775/1751 [12:58:56<16:23:17, 60.45s/it]                                                                                                                                     {'loss': '0.5757', 'grad_norm': '0.1953', 'learning_rate': '1.27e-05', 'ppl': '1.778', 'memory/max_active (GiB)': '76.08', 'memory/max_allocated (GiB)': '76.08', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '237', 'tokens/total': 1472315520, 'tokens/trainable': 543824128, 'epoch': '1.327'}
 44%|██████████████████████████████████████▌                                                | 775/1751 [12:58:56<16:23:17, 60.45s/it] 44%|██████████████████████████████████████▌                                                | 776/1751 [13:00:00<16:39:29, 61.51s/it]                                                                                                                                     {'loss': '0.494', 'grad_norm': '0.1768', 'learning_rate': '1.269e-05', 'ppl': '1.639', 'memory/max_active (GiB)': '76.03', 'memory/max_allocated (GiB)': '76.03', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '132.5', 'tokens/total': 1474367744, 'tokens/trainable': 544552704, 'epoch': '1.329'}
 44%|██████████████████████████████████████▌                                                | 776/1751 [13:00:00<16:39:29, 61.51s/it] 44%|██████████████████████████████████████▌                                                | 777/1751 [13:01:02<16:41:46, 61.71s/it]                                                                                                                                     {'loss': '0.5041', 'grad_norm': '0.1689', 'learning_rate': '1.267e-05', 'ppl': '1.655', 'memory/max_active (GiB)': '73.37', 'memory/max_allocated (GiB)': '73.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '41.49', 'tokens/total': 1476299520, 'tokens/trainable': 545307648, 'epoch': '1.331'}
 44%|██████████████████████████████████████▌                                                | 777/1751 [13:01:02<16:41:46, 61.71s/it] 44%|██████████████████████████████████████▋                                                | 778/1751 [13:02:03<16:37:01, 61.48s/it]                                                                                                                                     {'loss': '0.5054', 'grad_norm': '0.1953', 'learning_rate': '1.265e-05', 'ppl': '1.658', 'memory/max_active (GiB)': '77.07', 'memory/max_allocated (GiB)': '77.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '14.88', 'tokens/total': 1478192512, 'tokens/trainable': 546013632, 'epoch': '1.332'}
 44%|██████████████████████████████████████▋                                                | 778/1751 [13:02:03<16:37:01, 61.48s/it] 44%|██████████████████████████████████████▋                                                | 779/1751 [13:03:04<16:33:13, 61.31s/it]                                                                                                                                     {'loss': '0.5396', 'grad_norm': '0.1973', 'learning_rate': '1.263e-05', 'ppl': '1.715', 'memory/max_active (GiB)': '70.9', 'memory/max_allocated (GiB)': '70.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '130.2', 'tokens/total': 1480135680, 'tokens/trainable': 546728000, 'epoch': '1.334'}
 44%|██████████████████████████████████████▋                                                | 779/1751 [13:03:04<16:33:13, 61.31s/it] 45%|██████████████████████████████████████▊                                                | 780/1751 [13:04:04<16:27:02, 60.99s/it]                                                                                                                                     {'loss': '0.5593', 'grad_norm': '0.1875', 'learning_rate': '1.261e-05', 'ppl': '1.75', 'memory/max_active (GiB)': '75.48', 'memory/max_allocated (GiB)': '75.48', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '119.9', 'tokens/total': 1481993728, 'tokens/trainable': 547441280, 'epoch': '1.336'}
 45%|██████████████████████████████████████▊                                                | 780/1751 [13:04:04<16:27:02, 60.99s/it] 45%|██████████████████████████████████████▊                                                | 781/1751 [13:05:02<16:10:12, 60.01s/it]                                                                                                                                     {'loss': '0.5468', 'grad_norm': '0.2002', 'learning_rate': '1.259e-05', 'ppl': '1.728', 'memory/max_active (GiB)': '72.46', 'memory/max_allocated (GiB)': '72.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '158.6', 'tokens/total': 1483781888, 'tokens/trainable': 548080448, 'epoch': '1.338'}
 45%|██████████████████████████████████████▊                                                | 781/1751 [13:05:02<16:10:12, 60.01s/it] 45%|██████████████████████████████████████▊                                                | 782/1751 [13:06:01<16:04:14, 59.71s/it]                                                                                                                                     {'loss': '0.5518', 'grad_norm': '0.2002', 'learning_rate': '1.258e-05', 'ppl': '1.736', 'memory/max_active (GiB)': '69.98', 'memory/max_allocated (GiB)': '69.98', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.38', 'tokens/total': 1485656320, 'tokens/trainable': 548785600, 'epoch': '1.339'}
 45%|██████████████████████████████████████▊                                                | 782/1751 [13:06:01<16:04:14, 59.71s/it] 45%|██████████████████████████████████████▉                                                | 783/1751 [13:07:01<16:05:35, 59.85s/it]                                                                                                                                     {'loss': '0.5015', 'grad_norm': '0.1885', 'learning_rate': '1.256e-05', 'ppl': '1.651', 'memory/max_active (GiB)': '76.02', 'memory/max_allocated (GiB)': '76.02', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '119.2', 'tokens/total': 1487497472, 'tokens/trainable': 549466112, 'epoch': '1.341'}
 45%|██████████████████████████████████████▉                                                | 783/1751 [13:07:01<16:05:35, 59.85s/it] 45%|██████████████████████████████████████▉                                                | 784/1751 [13:08:03<16:15:18, 60.52s/it]                                                                                                                                     {'loss': '0.5117', 'grad_norm': '0.2041', 'learning_rate': '1.254e-05', 'ppl': '1.668', 'memory/max_active (GiB)': '73.57', 'memory/max_allocated (GiB)': '73.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '125.3', 'tokens/total': 1489502848, 'tokens/trainable': 550255296, 'epoch': '1.343'}
 45%|██████████████████████████████████████▉                                                | 784/1751 [13:08:03<16:15:18, 60.52s/it] 45%|███████████████████████████████████████                                                | 785/1751 [13:09:06<16:21:57, 60.99s/it]                                                                                                                                     {'loss': '0.4949', 'grad_norm': '0.1846', 'learning_rate': '1.252e-05', 'ppl': '1.64', 'memory/max_active (GiB)': '74.12', 'memory/max_allocated (GiB)': '74.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '151.2', 'tokens/total': 1491490176, 'tokens/trainable': 551009472, 'epoch': '1.344'}
 45%|███████████████████████████████████████                                                | 785/1751 [13:09:06<16:21:57, 60.99s/it] 45%|███████████████████████████████████████                                                | 786/1751 [13:10:06<16:16:33, 60.72s/it]                                                                                                                                     {'loss': '0.5182', 'grad_norm': '0.1865', 'learning_rate': '1.25e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '76.92', 'memory/max_allocated (GiB)': '76.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '27.78', 'tokens/total': 1493385344, 'tokens/trainable': 551710848, 'epoch': '1.346'}
 45%|███████████████████████████████████████                                                | 786/1751 [13:10:06<16:16:33, 60.72s/it] 45%|███████████████████████████████████████                                                | 787/1751 [13:11:08<16:25:35, 61.34s/it]                                                                                                                                     {'loss': '0.5098', 'grad_norm': '0.1797', 'learning_rate': '1.248e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '74.68', 'memory/max_allocated (GiB)': '74.68', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '180.3', 'tokens/total': 1495331456, 'tokens/trainable': 552459008, 'epoch': '1.348'}
 45%|███████████████████████████████████████                                                | 787/1751 [13:11:08<16:25:35, 61.34s/it] 45%|███████████████████████████████████████▏                                               | 788/1751 [13:12:10<16:26:06, 61.44s/it]                                                                                                                                     {'loss': '0.5112', 'grad_norm': '0.2012', 'learning_rate': '1.247e-05', 'ppl': '1.667', 'memory/max_active (GiB)': '74.01', 'memory/max_allocated (GiB)': '74.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111', 'tokens/total': 1497266432, 'tokens/trainable': 553151040, 'epoch': '1.35'}
 45%|███████████████████████████████████████▏                                               | 788/1751 [13:12:10<16:26:06, 61.44s/it] 45%|███████████████████████████████████████▏                                               | 789/1751 [13:13:09<16:13:10, 60.70s/it]                                                                                                                                     {'loss': '0.5279', 'grad_norm': '0.1895', 'learning_rate': '1.245e-05', 'ppl': '1.695', 'memory/max_active (GiB)': '69.29', 'memory/max_allocated (GiB)': '69.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '148.7', 'tokens/total': 1499123712, 'tokens/trainable': 553845120, 'epoch': '1.351'}
 45%|███████████████████████████████████████▏                                               | 789/1751 [13:13:09<16:13:10, 60.70s/it] 45%|███████████████████████████████████████▎                                               | 790/1751 [13:14:09<16:06:27, 60.34s/it]                                                                                                                                     {'loss': '0.5405', 'grad_norm': '0.1914', 'learning_rate': '1.243e-05', 'ppl': '1.717', 'memory/max_active (GiB)': '68.07', 'memory/max_allocated (GiB)': '68.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '28.01', 'tokens/total': 1501021568, 'tokens/trainable': 554549248, 'epoch': '1.353'}
 45%|███████████████████████████████████████▎                                               | 790/1751 [13:14:09<16:06:27, 60.34s/it] 45%|███████████████████████████████████████▎                                               | 791/1751 [13:15:10<16:09:47, 60.61s/it]                                                                                                                                     {'loss': '0.5533', 'grad_norm': '0.1982', 'learning_rate': '1.241e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '75.33', 'memory/max_allocated (GiB)': '75.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '68.91', 'tokens/total': 1502921856, 'tokens/trainable': 555273792, 'epoch': '1.355'}
 45%|███████████████████████████████████████▎                                               | 791/1751 [13:15:10<16:09:47, 60.61s/it] 45%|███████████████████████████████████████▎                                               | 792/1751 [13:16:11<16:11:11, 60.76s/it]                                                                                                                                     {'loss': '0.5364', 'grad_norm': '0.1934', 'learning_rate': '1.239e-05', 'ppl': '1.71', 'memory/max_active (GiB)': '76.03', 'memory/max_allocated (GiB)': '76.03', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.06', 'tokens/total': 1504816256, 'tokens/trainable': 555957888, 'epoch': '1.356'}
 45%|███████████████████████████████████████▎                                               | 792/1751 [13:16:11<16:11:11, 60.76s/it] 45%|███████████████████████████████████████▍                                               | 793/1751 [13:17:12<16:12:07, 60.89s/it]                                                                                                                                     {'loss': '0.5342', 'grad_norm': '0.1963', 'learning_rate': '1.237e-05', 'ppl': '1.706', 'memory/max_active (GiB)': '74.5', 'memory/max_allocated (GiB)': '74.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.9', 'tokens/total': 1506751488, 'tokens/trainable': 556671680, 'epoch': '1.358'}
 45%|███████████████████████████████████████▍                                               | 793/1751 [13:17:12<16:12:07, 60.89s/it] 45%|███████████████████████████████████████▍                                               | 794/1751 [13:18:11<16:02:56, 60.37s/it]                                                                                                                                     {'loss': '0.5137', 'grad_norm': '0.1895', 'learning_rate': '1.236e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '74.89', 'memory/max_allocated (GiB)': '74.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '57.43', 'tokens/total': 1508614656, 'tokens/trainable': 557332672, 'epoch': '1.36'}
 45%|███████████████████████████████████████▍                                               | 794/1751 [13:18:11<16:02:56, 60.37s/it] 45%|███████████████████████████████████████▌                                               | 795/1751 [13:19:09<15:51:32, 59.72s/it]                                                                                                                                     {'loss': '0.5302', 'grad_norm': '0.2002', 'learning_rate': '1.234e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '70.92', 'memory/max_allocated (GiB)': '70.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '122.2', 'tokens/total': 1510420864, 'tokens/trainable': 558004736, 'epoch': '1.362'}
 45%|███████████████████████████████████████▌                                               | 795/1751 [13:19:09<15:51:32, 59.72s/it] 45%|███████████████████████████████████████▌                                               | 796/1751 [13:20:10<15:55:00, 60.00s/it]                                                                                                                                     {'loss': '0.5185', 'grad_norm': '0.1719', 'learning_rate': '1.232e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '73.48', 'memory/max_allocated (GiB)': '73.48', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '57.96', 'tokens/total': 1512317696, 'tokens/trainable': 558753536, 'epoch': '1.363'}
 45%|███████████████████████████████████████▌                                               | 796/1751 [13:20:10<15:55:00, 60.00s/it] 46%|███████████████████████████████████████▌                                               | 797/1751 [13:21:10<15:51:23, 59.84s/it]                                                                                                                                     {'loss': '0.5136', 'grad_norm': '0.1855', 'learning_rate': '1.23e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '73.15', 'memory/max_allocated (GiB)': '73.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '83.31', 'tokens/total': 1514218240, 'tokens/trainable': 559461632, 'epoch': '1.365'}
 46%|███████████████████████████████████████▌                                               | 797/1751 [13:21:10<15:51:23, 59.84s/it] 46%|███████████████████████████████████████▋                                               | 798/1751 [13:22:07<15:39:23, 59.14s/it]                                                                                                                                     {'loss': '0.5605', 'grad_norm': '0.2021', 'learning_rate': '1.228e-05', 'ppl': '1.751', 'memory/max_active (GiB)': '71.27', 'memory/max_allocated (GiB)': '71.27', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.65', 'tokens/total': 1516022784, 'tokens/trainable': 560143104, 'epoch': '1.367'}
 46%|███████████████████████████████████████▋                                               | 798/1751 [13:22:07<15:39:23, 59.14s/it] 46%|███████████████████████████████████████▋                                               | 799/1751 [13:23:09<15:53:12, 60.08s/it]                                                                                                                                     {'loss': '0.5135', 'grad_norm': '0.1885', 'learning_rate': '1.226e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '71.74', 'memory/max_allocated (GiB)': '71.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.68', 'tokens/total': 1517963520, 'tokens/trainable': 560878144, 'epoch': '1.368'}
 46%|███████████████████████████████████████▋                                               | 799/1751 [13:23:09<15:53:12, 60.08s/it] 46%|███████████████████████████████████████▋                                               | 800/1751 [13:24:07<15:42:49, 59.48s/it]                                                                                                                                     {'loss': '0.5746', 'grad_norm': '0.1982', 'learning_rate': '1.225e-05', 'ppl': '1.776', 'memory/max_active (GiB)': '73.19', 'memory/max_allocated (GiB)': '73.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '149.8', 'tokens/total': 1519721472, 'tokens/trainable': 561531200, 'epoch': '1.37'}
 46%|███████████████████████████████████████▋                                               | 800/1751 [13:24:07<15:42:49, 59.48s/it] 46%|███████████████████████████████████████▊                                               | 801/1751 [13:25:07<15:41:05, 59.44s/it]                                                                                                                                     {'loss': '0.5217', 'grad_norm': '0.1992', 'learning_rate': '1.223e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '72.87', 'memory/max_allocated (GiB)': '72.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.16', 'tokens/total': 1521571072, 'tokens/trainable': 562215744, 'epoch': '1.372'}
 46%|███████████████████████████████████████▊                                               | 801/1751 [13:25:07<15:41:05, 59.44s/it] 46%|███████████████████████████████████████▊                                               | 802/1751 [13:26:08<15:47:34, 59.91s/it]                                                                                                                                     {'loss': '0.523', 'grad_norm': '0.1826', 'learning_rate': '1.221e-05', 'ppl': '1.687', 'memory/max_active (GiB)': '71.39', 'memory/max_allocated (GiB)': '71.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.44', 'tokens/total': 1523516032, 'tokens/trainable': 562944448, 'epoch': '1.374'}
 46%|███████████████████████████████████████▊                                               | 802/1751 [13:26:08<15:47:34, 59.91s/it] 46%|███████████████████████████████████████▉                                               | 803/1751 [13:27:07<15:41:02, 59.56s/it]                                                                                                                                     {'loss': '0.5465', 'grad_norm': '0.1855', 'learning_rate': '1.219e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '71.81', 'memory/max_allocated (GiB)': '71.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '30.82', 'tokens/total': 1525362304, 'tokens/trainable': 563632896, 'epoch': '1.375'}
 46%|███████████████████████████████████████▉                                               | 803/1751 [13:27:07<15:41:02, 59.56s/it] 46%|███████████████████████████████████████▉                                               | 804/1751 [13:28:07<15:42:45, 59.73s/it]                                                                                                                                     {'loss': '0.5171', 'grad_norm': '0.1846', 'learning_rate': '1.217e-05', 'ppl': '1.677', 'memory/max_active (GiB)': '74.64', 'memory/max_allocated (GiB)': '74.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '110.9', 'tokens/total': 1527218176, 'tokens/trainable': 564338560, 'epoch': '1.377'}
 46%|███████████████████████████████████████▉                                               | 804/1751 [13:28:07<15:42:45, 59.73s/it] 46%|███████████████████████████████████████▉                                               | 805/1751 [13:29:09<15:52:24, 60.41s/it]                                                                                                                                     {'loss': '0.5075', 'grad_norm': '0.1855', 'learning_rate': '1.215e-05', 'ppl': '1.661', 'memory/max_active (GiB)': '76.65', 'memory/max_allocated (GiB)': '76.65', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '79.35', 'tokens/total': 1529205120, 'tokens/trainable': 565059328, 'epoch': '1.379'}
 46%|███████████████████████████████████████▉                                               | 805/1751 [13:29:09<15:52:24, 60.41s/it] 46%|████████████████████████████████████████                                               | 806/1751 [13:30:09<15:49:23, 60.28s/it]                                                                                                                                     {'loss': '0.5761', 'grad_norm': '0.2002', 'learning_rate': '1.214e-05', 'ppl': '1.779', 'memory/max_active (GiB)': '75.25', 'memory/max_allocated (GiB)': '75.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '87.2', 'tokens/total': 1531071744, 'tokens/trainable': 565737152, 'epoch': '1.38'}
 46%|████████████████████████████████████████                                               | 806/1751 [13:30:09<15:49:23, 60.28s/it] 46%|████████████████████████████████████████                                               | 807/1751 [13:31:09<15:47:56, 60.25s/it]                                                                                                                                     {'loss': '0.5324', 'grad_norm': '0.1973', 'learning_rate': '1.212e-05', 'ppl': '1.703', 'memory/max_active (GiB)': '75.15', 'memory/max_allocated (GiB)': '75.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.73', 'tokens/total': 1532964864, 'tokens/trainable': 566418560, 'epoch': '1.382'}
 46%|████████████████████████████████████████                                               | 807/1751 [13:31:09<15:47:56, 60.25s/it] 46%|████████████████████████████████████████▏                                              | 808/1751 [13:32:12<16:01:51, 61.20s/it]                                                                                                                                     {'loss': '0.5328', 'grad_norm': '0.2002', 'learning_rate': '1.21e-05', 'ppl': '1.704', 'memory/max_active (GiB)': '71.44', 'memory/max_allocated (GiB)': '71.44', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.62', 'tokens/total': 1534953344, 'tokens/trainable': 567127424, 'epoch': '1.384'}
 46%|████████████████████████████████████████▏                                              | 808/1751 [13:32:12<16:01:51, 61.20s/it] 46%|████████████████████████████████████████▏                                              | 809/1751 [13:33:12<15:54:17, 60.78s/it]                                                                                                                                     {'loss': '0.5135', 'grad_norm': '0.1904', 'learning_rate': '1.208e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '76.05', 'memory/max_allocated (GiB)': '76.05', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111.1', 'tokens/total': 1536815616, 'tokens/trainable': 567827520, 'epoch': '1.386'}
 46%|████████████████████████████████████████▏                                              | 809/1751 [13:33:12<15:54:17, 60.78s/it] 46%|████████████████████████████████████████▏                                              | 810/1751 [13:34:10<15:41:50, 60.05s/it]                                                                                                                                     {'loss': '0.5445', 'grad_norm': '0.1924', 'learning_rate': '1.206e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '74.47', 'memory/max_allocated (GiB)': '74.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.2', 'tokens/total': 1538644736, 'tokens/trainable': 568511232, 'epoch': '1.387'}
 46%|████████████████████████████████████████▏                                              | 810/1751 [13:34:10<15:41:50, 60.05s/it] 46%|████████████████████████████████████████▎                                              | 811/1751 [13:35:10<15:39:20, 59.96s/it]                                                                                                                                     {'loss': '0.5403', 'grad_norm': '0.1924', 'learning_rate': '1.204e-05', 'ppl': '1.716', 'memory/max_active (GiB)': '69.57', 'memory/max_allocated (GiB)': '69.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.16', 'tokens/total': 1540482944, 'tokens/trainable': 569219904, 'epoch': '1.389'}
 46%|████████████████████████████████████████▎                                              | 811/1751 [13:35:10<15:39:20, 59.96s/it] 46%|████████████████████████████████████████▎                                              | 812/1751 [13:36:07<15:21:58, 58.91s/it]                                                                                                                                     {'loss': '0.5532', 'grad_norm': '0.1924', 'learning_rate': '1.202e-05', 'ppl': '1.739', 'memory/max_active (GiB)': '74.61', 'memory/max_allocated (GiB)': '74.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '102.5', 'tokens/total': 1542217472, 'tokens/trainable': 569856576, 'epoch': '1.391'}
 46%|████████████████████████████████████████▎                                              | 812/1751 [13:36:07<15:21:58, 58.91s/it] 46%|████████████████████████████████████████▍                                              | 813/1751 [13:37:06<15:21:48, 58.96s/it]                                                                                                                                     {'loss': '0.5181', 'grad_norm': '0.1865', 'learning_rate': '1.201e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '76.62', 'memory/max_allocated (GiB)': '76.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107.6', 'tokens/total': 1544074880, 'tokens/trainable': 570552320, 'epoch': '1.392'}
 46%|████████████████████████████████████████▍                                              | 813/1751 [13:37:06<15:21:48, 58.96s/it] 46%|████████████████████████████████████████▍                                              | 814/1751 [13:38:03<15:11:02, 58.34s/it]                                                                                                                                     {'loss': '0.5505', 'grad_norm': '0.2021', 'learning_rate': '1.199e-05', 'ppl': '1.734', 'memory/max_active (GiB)': '73.3', 'memory/max_allocated (GiB)': '73.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.12', 'tokens/total': 1545837184, 'tokens/trainable': 571195648, 'epoch': '1.394'}
 46%|████████████████████████████████████████▍                                              | 814/1751 [13:38:03<15:11:02, 58.34s/it] 47%|████████████████████████████████████████▍                                              | 815/1751 [13:39:02<15:13:01, 58.53s/it]                                                                                                                                     {'loss': '0.554', 'grad_norm': '0.2002', 'learning_rate': '1.197e-05', 'ppl': '1.74', 'memory/max_active (GiB)': '77.46', 'memory/max_allocated (GiB)': '77.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '43.36', 'tokens/total': 1547651072, 'tokens/trainable': 571825472, 'epoch': '1.396'}
 47%|████████████████████████████████████████▍                                              | 815/1751 [13:39:02<15:13:01, 58.53s/it] 47%|████████████████████████████████████████▌                                              | 816/1751 [13:40:03<15:23:50, 59.28s/it]                                                                                                                                     {'loss': '0.507', 'grad_norm': '0.1846', 'learning_rate': '1.195e-05', 'ppl': '1.66', 'memory/max_active (GiB)': '72.25', 'memory/max_allocated (GiB)': '72.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '126.2', 'tokens/total': 1549604864, 'tokens/trainable': 572559296, 'epoch': '1.398'}
 47%|████████████████████████████████████████▌                                              | 816/1751 [13:40:03<15:23:50, 59.28s/it] 47%|████████████████████████████████████████▌                                              | 817/1751 [13:41:00<15:15:46, 58.83s/it]                                                                                                                                     {'loss': '0.519', 'grad_norm': '0.2002', 'learning_rate': '1.193e-05', 'ppl': '1.68', 'memory/max_active (GiB)': '75.08', 'memory/max_allocated (GiB)': '75.08', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '29.45', 'tokens/total': 1551434752, 'tokens/trainable': 573219136, 'epoch': '1.399'}
 47%|████████████████████████████████████████▌                                              | 817/1751 [13:41:00<15:15:46, 58.83s/it] 47%|████████████████████████████████████████▋                                              | 818/1751 [13:42:03<15:32:42, 59.98s/it]                                                                                                                                     {'loss': '0.5161', 'grad_norm': '0.1934', 'learning_rate': '1.191e-05', 'ppl': '1.675', 'memory/max_active (GiB)': '75.51', 'memory/max_allocated (GiB)': '75.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.11', 'tokens/total': 1553405440, 'tokens/trainable': 573956224, 'epoch': '1.401'}
 47%|████████████████████████████████████████▋                                              | 818/1751 [13:42:03<15:32:42, 59.98s/it] 47%|████████████████████████████████████████▋                                              | 819/1751 [13:43:04<15:38:41, 60.43s/it]                                                                                                                                     {'loss': '0.5113', 'grad_norm': '0.1875', 'learning_rate': '1.19e-05', 'ppl': '1.667', 'memory/max_active (GiB)': '74.11', 'memory/max_allocated (GiB)': '74.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '51.37', 'tokens/total': 1555352448, 'tokens/trainable': 574678528, 'epoch': '1.403'}
 47%|████████████████████████████████████████▋                                              | 819/1751 [13:43:04<15:38:41, 60.43s/it] 47%|████████████████████████████████████████▋                                              | 820/1751 [13:44:05<15:37:11, 60.40s/it]                                                                                                                                     {'loss': '0.5184', 'grad_norm': '0.1826', 'learning_rate': '1.188e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '75.84', 'memory/max_allocated (GiB)': '75.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '83.77', 'tokens/total': 1557310976, 'tokens/trainable': 575416000, 'epoch': '1.404'}
 47%|████████████████████████████████████████▋                                              | 820/1751 [13:44:05<15:37:11, 60.40s/it] 47%|████████████████████████████████████████▊                                              | 821/1751 [13:45:06<15:40:02, 60.65s/it]                                                                                                                                     {'loss': '0.5206', 'grad_norm': '0.1855', 'learning_rate': '1.186e-05', 'ppl': '1.683', 'memory/max_active (GiB)': '76.23', 'memory/max_allocated (GiB)': '76.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.8', 'tokens/total': 1559234816, 'tokens/trainable': 576141952, 'epoch': '1.406'}
 47%|████████████████████████████████████████▊                                              | 821/1751 [13:45:06<15:40:02, 60.65s/it] 47%|████████████████████████████████████████▊                                              | 822/1751 [13:46:07<15:39:21, 60.67s/it]                                                                                                                                     {'loss': '0.513', 'grad_norm': '0.1924', 'learning_rate': '1.184e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '77.24', 'memory/max_allocated (GiB)': '77.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '22.25', 'tokens/total': 1561140736, 'tokens/trainable': 576867840, 'epoch': '1.408'}
 47%|████████████████████████████████████████▊                                              | 822/1751 [13:46:07<15:39:21, 60.67s/it] 47%|████████████████████████████████████████▉                                              | 823/1751 [13:47:04<15:20:53, 59.54s/it]                                                                                                                                     {'loss': '0.5269', 'grad_norm': '0.1924', 'learning_rate': '1.182e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '72.41', 'memory/max_allocated (GiB)': '72.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '41.91', 'tokens/total': 1562891904, 'tokens/trainable': 577514496, 'epoch': '1.41'}
 47%|████████████████████████████████████████▉                                              | 823/1751 [13:47:04<15:20:53, 59.54s/it] 47%|████████████████████████████████████████▉                                              | 824/1751 [13:48:04<15:23:13, 59.76s/it]                                                                                                                                     {'loss': '0.5074', 'grad_norm': '0.1885', 'learning_rate': '1.18e-05', 'ppl': '1.661', 'memory/max_active (GiB)': '76.49', 'memory/max_allocated (GiB)': '76.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.69', 'tokens/total': 1564771200, 'tokens/trainable': 578201152, 'epoch': '1.411'}
 47%|████████████████████████████████████████▉                                              | 824/1751 [13:48:04<15:23:13, 59.76s/it] 47%|████████████████████████████████████████▉                                              | 825/1751 [13:49:04<15:21:58, 59.74s/it]                                                                                                                                     {'loss': '0.524', 'grad_norm': '0.1973', 'learning_rate': '1.178e-05', 'ppl': '1.689', 'memory/max_active (GiB)': '75.59', 'memory/max_allocated (GiB)': '75.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '159.5', 'tokens/total': 1566551040, 'tokens/trainable': 578877632, 'epoch': '1.413'}
 47%|████████████████████████████████████████▉                                              | 825/1751 [13:49:04<15:21:58, 59.74s/it] 47%|█████████████████████████████████████████                                              | 826/1751 [13:50:05<15:29:09, 60.27s/it]                                                                                                                                     {'loss': '0.5301', 'grad_norm': '0.1758', 'learning_rate': '1.177e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '73.19', 'memory/max_allocated (GiB)': '73.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '53.25', 'tokens/total': 1568443136, 'tokens/trainable': 579592960, 'epoch': '1.415'}
 47%|█████████████████████████████████████████                                              | 826/1751 [13:50:05<15:29:09, 60.27s/it] 47%|█████████████████████████████████████████                                              | 827/1751 [13:51:04<15:20:23, 59.77s/it]                                                                                                                                     {'loss': '0.5263', 'grad_norm': '0.1943', 'learning_rate': '1.175e-05', 'ppl': '1.693', 'memory/max_active (GiB)': '74.54', 'memory/max_allocated (GiB)': '74.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.17', 'tokens/total': 1570297472, 'tokens/trainable': 580275392, 'epoch': '1.416'}
 47%|█████████████████████████████████████████                                              | 827/1751 [13:51:04<15:20:23, 59.77s/it] 47%|█████████████████████████████████████████▏                                             | 828/1751 [13:52:04<15:21:33, 59.91s/it]                                                                                                                                     {'loss': '0.5238', 'grad_norm': '0.208', 'learning_rate': '1.173e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '76.21', 'memory/max_allocated (GiB)': '76.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '25.72', 'tokens/total': 1572194560, 'tokens/trainable': 580936704, 'epoch': '1.418'}
 47%|█████████████████████████████████████████▏                                             | 828/1751 [13:52:04<15:21:33, 59.91s/it] 47%|█████████████████████████████████████████▏                                             | 829/1751 [13:53:07<15:34:34, 60.82s/it]                                                                                                                                     {'loss': '0.5127', 'grad_norm': '0.1895', 'learning_rate': '1.171e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '77.62', 'memory/max_allocated (GiB)': '77.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '92.45', 'tokens/total': 1574204032, 'tokens/trainable': 581650688, 'epoch': '1.42'}
 47%|█████████████████████████████████████████▏                                             | 829/1751 [13:53:07<15:34:34, 60.82s/it] 47%|█████████████████████████████████████████▏                                             | 830/1751 [13:54:08<15:33:04, 60.79s/it]                                                                                                                                     {'loss': '0.5446', 'grad_norm': '0.1934', 'learning_rate': '1.169e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '74.11', 'memory/max_allocated (GiB)': '74.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.8', 'tokens/total': 1576117504, 'tokens/trainable': 582346880, 'epoch': '1.422'}
 47%|█████████████████████████████████████████▏                                             | 830/1751 [13:54:08<15:33:04, 60.79s/it] 47%|█████████████████████████████████████████▎                                             | 831/1751 [13:55:08<15:30:50, 60.71s/it]                                                                                                                                     {'loss': '0.527', 'grad_norm': '0.1855', 'learning_rate': '1.167e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '75.95', 'memory/max_allocated (GiB)': '75.95', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.47', 'tokens/total': 1577986432, 'tokens/trainable': 583036544, 'epoch': '1.423'}
 47%|█████████████████████████████████████████▎                                             | 831/1751 [13:55:08<15:30:50, 60.71s/it] 48%|█████████████████████████████████████████▎                                             | 832/1751 [13:56:08<15:24:53, 60.38s/it]                                                                                                                                     {'loss': '0.5385', 'grad_norm': '0.1885', 'learning_rate': '1.165e-05', 'ppl': '1.713', 'memory/max_active (GiB)': '74.49', 'memory/max_allocated (GiB)': '74.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '153.6', 'tokens/total': 1579881472, 'tokens/trainable': 583702464, 'epoch': '1.425'}
 48%|█████████████████████████████████████████▎                                             | 832/1751 [13:56:08<15:24:53, 60.38s/it] 48%|█████████████████████████████████████████▍                                             | 833/1751 [13:57:07<15:20:06, 60.14s/it]                                                                                                                                     {'loss': '0.535', 'grad_norm': '0.1992', 'learning_rate': '1.164e-05', 'ppl': '1.707', 'memory/max_active (GiB)': '73.8', 'memory/max_allocated (GiB)': '73.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '27.69', 'tokens/total': 1581802112, 'tokens/trainable': 584413056, 'epoch': '1.427'}
 48%|█████████████████████████████████████████▍                                             | 833/1751 [13:57:07<15:20:06, 60.14s/it] 48%|█████████████████████████████████████████▍                                             | 834/1751 [13:58:02<14:55:34, 58.60s/it]                                                                                                                                     {'loss': '0.5765', 'grad_norm': '0.207', 'learning_rate': '1.162e-05', 'ppl': '1.78', 'memory/max_active (GiB)': '74.26', 'memory/max_allocated (GiB)': '74.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '63.62', 'tokens/total': 1583460864, 'tokens/trainable': 585001664, 'epoch': '1.428'}
 48%|█████████████████████████████████████████▍                                             | 834/1751 [13:58:02<14:55:34, 58.60s/it] 48%|█████████████████████████████████████████▍                                             | 835/1751 [13:59:03<15:02:51, 59.14s/it]                                                                                                                                     {'loss': '0.5201', 'grad_norm': '0.1855', 'learning_rate': '1.16e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '70.56', 'memory/max_allocated (GiB)': '70.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '48.42', 'tokens/total': 1585357184, 'tokens/trainable': 585700800, 'epoch': '1.43'}
 48%|█████████████████████████████████████████▍                                             | 835/1751 [13:59:03<15:02:51, 59.14s/it] 48%|█████████████████████████████████████████▌                                             | 836/1751 [14:00:03<15:08:40, 59.59s/it]                                                                                                                                     {'loss': '0.4986', 'grad_norm': '0.1836', 'learning_rate': '1.158e-05', 'ppl': '1.646', 'memory/max_active (GiB)': '76.12', 'memory/max_allocated (GiB)': '76.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107.9', 'tokens/total': 1587300224, 'tokens/trainable': 586418752, 'epoch': '1.432'}
 48%|█████████████████████████████████████████▌                                             | 836/1751 [14:00:03<15:08:40, 59.59s/it] 48%|█████████████████████████████████████████▌                                             | 837/1751 [14:01:04<15:13:26, 59.96s/it]                                                                                                                                     {'loss': '0.5429', 'grad_norm': '0.1787', 'learning_rate': '1.156e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '75.92', 'memory/max_allocated (GiB)': '75.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '93.89', 'tokens/total': 1589244672, 'tokens/trainable': 587177856, 'epoch': '1.434'}
 48%|█████████████████████████████████████████▌                                             | 837/1751 [14:01:04<15:13:26, 59.96s/it] 48%|█████████████████████████████████████████▋                                             | 838/1751 [14:02:02<15:04:13, 59.42s/it]                                                                                                                                     {'loss': '0.5117', 'grad_norm': '0.1826', 'learning_rate': '1.154e-05', 'ppl': '1.668', 'memory/max_active (GiB)': '76.39', 'memory/max_allocated (GiB)': '76.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '155.4', 'tokens/total': 1591080064, 'tokens/trainable': 587846400, 'epoch': '1.435'}
 48%|█████████████████████████████████████████▋                                             | 838/1751 [14:02:02<15:04:13, 59.42s/it] 48%|█████████████████████████████████████████▋                                             | 839/1751 [14:03:02<15:06:01, 59.61s/it]                                                                                                                                     {'loss': '0.488', 'grad_norm': '0.1846', 'learning_rate': '1.152e-05', 'ppl': '1.629', 'memory/max_active (GiB)': '71.93', 'memory/max_allocated (GiB)': '71.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.57', 'tokens/total': 1592978560, 'tokens/trainable': 588528768, 'epoch': '1.437'}
 48%|█████████████████████████████████████████▋                                             | 839/1751 [14:03:02<15:06:01, 59.61s/it] 48%|█████████████████████████████████████████▋                                             | 840/1751 [14:04:02<15:03:24, 59.50s/it]                                                                                                                                     {'loss': '0.5129', 'grad_norm': '0.1895', 'learning_rate': '1.15e-05', 'ppl': '1.67', 'memory/max_active (GiB)': '74.24', 'memory/max_allocated (GiB)': '74.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.5', 'tokens/total': 1594795392, 'tokens/trainable': 589179264, 'epoch': '1.439'}
 48%|█████████████████████████████████████████▋                                             | 840/1751 [14:04:02<15:03:24, 59.50s/it] 48%|█████████████████████████████████████████▊                                             | 841/1751 [14:05:05<15:19:57, 60.66s/it]                                                                                                                                     {'loss': '0.5102', 'grad_norm': '0.1768', 'learning_rate': '1.149e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '76.22', 'memory/max_allocated (GiB)': '76.22', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.19', 'tokens/total': 1596795904, 'tokens/trainable': 589936704, 'epoch': '1.44'}
 48%|█████████████████████████████████████████▊                                             | 841/1751 [14:05:05<15:19:57, 60.66s/it] 48%|█████████████████████████████████████████▊                                             | 842/1751 [14:06:05<15:14:05, 60.34s/it]                                                                                                                                     {'loss': '0.4932', 'grad_norm': '0.1738', 'learning_rate': '1.147e-05', 'ppl': '1.638', 'memory/max_active (GiB)': '72.21', 'memory/max_allocated (GiB)': '72.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '23.22', 'tokens/total': 1598683392, 'tokens/trainable': 590642240, 'epoch': '1.442'}
 48%|█████████████████████████████████████████▊                                             | 842/1751 [14:06:05<15:14:05, 60.34s/it] 48%|█████████████████████████████████████████▉                                             | 843/1751 [14:07:05<15:14:33, 60.43s/it]                                                                                                                                     {'loss': '0.5151', 'grad_norm': '0.1992', 'learning_rate': '1.145e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '70.96', 'memory/max_allocated (GiB)': '70.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.73', 'tokens/total': 1600590464, 'tokens/trainable': 591337280, 'epoch': '1.444'}
 48%|█████████████████████████████████████████▉                                             | 843/1751 [14:07:05<15:14:33, 60.43s/it] 48%|█████████████████████████████████████████▉                                             | 844/1751 [14:08:05<15:09:44, 60.18s/it]                                                                                                                                     {'loss': '0.5342', 'grad_norm': '0.1914', 'learning_rate': '1.143e-05', 'ppl': '1.706', 'memory/max_active (GiB)': '75.05', 'memory/max_allocated (GiB)': '75.05', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '230.3', 'tokens/total': 1602457088, 'tokens/trainable': 592009600, 'epoch': '1.446'}
 48%|█████████████████████████████████████████▉                                             | 844/1751 [14:08:05<15:09:44, 60.18s/it] 48%|█████████████████████████████████████████▉                                             | 845/1751 [14:09:05<15:06:56, 60.06s/it]                                                                                                                                     {'loss': '0.5088', 'grad_norm': '0.1904', 'learning_rate': '1.141e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '76.76', 'memory/max_allocated (GiB)': '76.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.88', 'tokens/total': 1604353536, 'tokens/trainable': 592692288, 'epoch': '1.447'}
 48%|█████████████████████████████████████████▉                                             | 845/1751 [14:09:05<15:06:56, 60.06s/it] 48%|██████████████████████████████████████████                                             | 846/1751 [14:10:07<15:16:37, 60.77s/it]                                                                                                                                     {'loss': '0.5246', 'grad_norm': '0.1846', 'learning_rate': '1.139e-05', 'ppl': '1.69', 'memory/max_active (GiB)': '76.19', 'memory/max_allocated (GiB)': '76.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.63', 'tokens/total': 1606316288, 'tokens/trainable': 593430016, 'epoch': '1.449'}
 48%|██████████████████████████████████████████                                             | 846/1751 [14:10:07<15:16:37, 60.77s/it] 48%|██████████████████████████████████████████                                             | 847/1751 [14:11:06<15:09:13, 60.35s/it]                                                                                                                                     {'loss': '0.5132', 'grad_norm': '0.1963', 'learning_rate': '1.137e-05', 'ppl': '1.671', 'memory/max_active (GiB)': '74.79', 'memory/max_allocated (GiB)': '74.79', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '194.6', 'tokens/total': 1608161792, 'tokens/trainable': 594109056, 'epoch': '1.451'}
 48%|██████████████████████████████████████████                                             | 847/1751 [14:11:06<15:09:13, 60.35s/it] 48%|██████████████████████████████████████████▏                                            | 848/1751 [14:12:06<15:03:28, 60.03s/it]                                                                                                                                     {'loss': '0.5491', 'grad_norm': '0.1963', 'learning_rate': '1.136e-05', 'ppl': '1.732', 'memory/max_active (GiB)': '75.34', 'memory/max_allocated (GiB)': '75.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '66.68', 'tokens/total': 1610024192, 'tokens/trainable': 594785088, 'epoch': '1.452'}
 48%|██████████████████████████████████████████▏                                            | 848/1751 [14:12:06<15:03:28, 60.03s/it] 48%|██████████████████████████████████████████▏                                            | 849/1751 [14:13:06<15:02:00, 60.00s/it]                                                                                                                                     {'loss': '0.5371', 'grad_norm': '0.1895', 'learning_rate': '1.134e-05', 'ppl': '1.711', 'memory/max_active (GiB)': '73.65', 'memory/max_allocated (GiB)': '73.65', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '102', 'tokens/total': 1611941504, 'tokens/trainable': 595474176, 'epoch': '1.454'}
 48%|██████████████████████████████████████████▏                                            | 849/1751 [14:13:06<15:02:00, 60.00s/it] 49%|██████████████████████████████████████████▏                                            | 850/1751 [14:14:05<14:59:33, 59.90s/it]                                                                                                                                     {'loss': '0.5272', 'grad_norm': '0.1934', 'learning_rate': '1.132e-05', 'ppl': '1.694', 'memory/max_active (GiB)': '73.19', 'memory/max_allocated (GiB)': '73.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '103.8', 'tokens/total': 1613849088, 'tokens/trainable': 596192768, 'epoch': '1.456'}
 49%|██████████████████████████████████████████▏                                            | 850/1751 [14:14:05<14:59:33, 59.90s/it] 49%|██████████████████████████████████████████▎                                            | 851/1751 [14:15:04<14:53:47, 59.59s/it]                                                                                                                                     {'loss': '0.5201', 'grad_norm': '0.1826', 'learning_rate': '1.13e-05', 'ppl': '1.682', 'memory/max_active (GiB)': '72.71', 'memory/max_allocated (GiB)': '72.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '76.16', 'tokens/total': 1615692416, 'tokens/trainable': 596869120, 'epoch': '1.458'}
 49%|██████████████████████████████████████████▎                                            | 851/1751 [14:15:04<14:53:47, 59.59s/it] 49%|██████████████████████████████████████████▎                                            | 852/1751 [14:16:08<15:11:30, 60.83s/it]                                                                                                                                     {'loss': '0.5437', 'grad_norm': '0.1904', 'learning_rate': '1.128e-05', 'ppl': '1.722', 'memory/max_active (GiB)': '72.24', 'memory/max_allocated (GiB)': '72.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.13', 'tokens/total': 1617692672, 'tokens/trainable': 597602752, 'epoch': '1.459'}
 49%|██████████████████████████████████████████▎                                            | 852/1751 [14:16:08<15:11:30, 60.83s/it] 49%|██████████████████████████████████████████▍                                            | 853/1751 [14:17:11<15:19:55, 61.46s/it]                                                                                                                                     {'loss': '0.493', 'grad_norm': '0.1836', 'learning_rate': '1.126e-05', 'ppl': '1.637', 'memory/max_active (GiB)': '76.67', 'memory/max_allocated (GiB)': '76.67', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '43.48', 'tokens/total': 1619649408, 'tokens/trainable': 598340096, 'epoch': '1.461'}
 49%|██████████████████████████████████████████▍                                            | 853/1751 [14:17:11<15:19:55, 61.46s/it] 49%|██████████████████████████████████████████▍                                            | 854/1751 [14:18:13<15:20:30, 61.57s/it]                                                                                                                                     {'loss': '0.4951', 'grad_norm': '0.1885', 'learning_rate': '1.124e-05', 'ppl': '1.641', 'memory/max_active (GiB)': '76.97', 'memory/max_allocated (GiB)': '76.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '62.76', 'tokens/total': 1621619456, 'tokens/trainable': 599085568, 'epoch': '1.463'}
 49%|██████████████████████████████████████████▍                                            | 854/1751 [14:18:13<15:20:30, 61.57s/it] 49%|██████████████████████████████████████████▍                                            | 855/1751 [14:19:12<15:08:17, 60.82s/it]                                                                                                                                     {'loss': '0.5379', 'grad_norm': '0.1885', 'learning_rate': '1.122e-05', 'ppl': '1.712', 'memory/max_active (GiB)': '75.64', 'memory/max_allocated (GiB)': '75.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.95', 'tokens/total': 1623460864, 'tokens/trainable': 599788928, 'epoch': '1.464'}
 49%|██████████████████████████████████████████▍                                            | 855/1751 [14:19:12<15:08:17, 60.82s/it] 49%|██████████████████████████████████████████▌                                            | 856/1751 [14:20:13<15:10:31, 61.04s/it]                                                                                                                                     {'loss': '0.5018', 'grad_norm': '0.1807', 'learning_rate': '1.121e-05', 'ppl': '1.652', 'memory/max_active (GiB)': '75.26', 'memory/max_allocated (GiB)': '75.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '146', 'tokens/total': 1625398144, 'tokens/trainable': 600512256, 'epoch': '1.466'}
 49%|██████████████████████████████████████████▌                                            | 856/1751 [14:20:13<15:10:31, 61.04s/it] 49%|██████████████████████████████████████████▌                                            | 857/1751 [14:21:13<15:02:49, 60.59s/it]                                                                                                                                     {'loss': '0.4972', 'grad_norm': '0.1865', 'learning_rate': '1.119e-05', 'ppl': '1.644', 'memory/max_active (GiB)': '76.33', 'memory/max_allocated (GiB)': '76.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '115.3', 'tokens/total': 1627261184, 'tokens/trainable': 601206592, 'epoch': '1.468'}
 49%|██████████████████████████████████████████▌                                            | 857/1751 [14:21:13<15:02:49, 60.59s/it] 49%|██████████████████████████████████████████▋                                            | 858/1751 [14:22:13<14:58:38, 60.38s/it]                                                                                                                                     {'loss': '0.51', 'grad_norm': '0.1973', 'learning_rate': '1.117e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '72.24', 'memory/max_allocated (GiB)': '72.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '51.83', 'tokens/total': 1629124992, 'tokens/trainable': 601868352, 'epoch': '1.47'}
 49%|██████████████████████████████████████████▋                                            | 858/1751 [14:22:13<14:58:38, 60.38s/it] 49%|██████████████████████████████████████████▋                                            | 859/1751 [14:23:14<15:01:06, 60.61s/it]                                                                                                                                     {'loss': '0.5226', 'grad_norm': '0.1875', 'learning_rate': '1.115e-05', 'ppl': '1.686', 'memory/max_active (GiB)': '74.89', 'memory/max_allocated (GiB)': '74.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '93.85', 'tokens/total': 1631027072, 'tokens/trainable': 602600320, 'epoch': '1.471'}
 49%|██████████████████████████████████████████▋                                            | 859/1751 [14:23:14<15:01:06, 60.61s/it] 49%|██████████████████████████████████████████▋                                            | 860/1751 [14:24:18<15:15:10, 61.63s/it]                                                                                                                                     {'loss': '0.5313', 'grad_norm': '0.1807', 'learning_rate': '1.113e-05', 'ppl': '1.701', 'memory/max_active (GiB)': '75.64', 'memory/max_allocated (GiB)': '75.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.91', 'tokens/total': 1633070080, 'tokens/trainable': 603350464, 'epoch': '1.473'}
 49%|██████████████████████████████████████████▋                                            | 860/1751 [14:24:18<15:15:10, 61.63s/it] 49%|██████████████████████████████████████████▊                                            | 861/1751 [14:25:20<15:14:43, 61.67s/it]                                                                                                                                     {'loss': '0.5114', 'grad_norm': '0.1748', 'learning_rate': '1.111e-05', 'ppl': '1.668', 'memory/max_active (GiB)': '74.48', 'memory/max_allocated (GiB)': '74.48', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '45.11', 'tokens/total': 1635060608, 'tokens/trainable': 604098816, 'epoch': '1.475'}
 49%|██████████████████████████████████████████▊                                            | 861/1751 [14:25:20<15:14:43, 61.67s/it] 49%|██████████████████████████████████████████▊                                            | 862/1751 [14:26:20<15:06:20, 61.17s/it]                                                                                                                                     {'loss': '0.4978', 'grad_norm': '0.1758', 'learning_rate': '1.109e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '70.35', 'memory/max_allocated (GiB)': '70.35', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '155', 'tokens/total': 1636966528, 'tokens/trainable': 604821952, 'epoch': '1.476'}
 49%|██████████████████████████████████████████▊                                            | 862/1751 [14:26:20<15:06:20, 61.17s/it] 49%|██████████████████████████████████████████▉                                            | 863/1751 [14:27:19<14:58:05, 60.68s/it]                                                                                                                                     {'loss': '0.5345', 'grad_norm': '0.2051', 'learning_rate': '1.107e-05', 'ppl': '1.707', 'memory/max_active (GiB)': '76.81', 'memory/max_allocated (GiB)': '76.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '31.77', 'tokens/total': 1638850560, 'tokens/trainable': 605534336, 'epoch': '1.478'}
 49%|██████████████████████████████████████████▉                                            | 863/1751 [14:27:19<14:58:05, 60.68s/it] 49%|██████████████████████████████████████████▉                                            | 864/1751 [14:28:21<15:01:10, 60.96s/it]                                                                                                                                     {'loss': '0.5596', 'grad_norm': '0.1904', 'learning_rate': '1.106e-05', 'ppl': '1.75', 'memory/max_active (GiB)': '77.4', 'memory/max_allocated (GiB)': '77.4', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '102.5', 'tokens/total': 1640836352, 'tokens/trainable': 606284672, 'epoch': '1.48'}
 49%|██████████████████████████████████████████▉                                            | 864/1751 [14:28:21<15:01:10, 60.96s/it] 49%|██████████████████████████████████████████▉                                            | 865/1751 [14:29:19<14:47:04, 60.07s/it]                                                                                                                                     {'loss': '0.544', 'grad_norm': '0.1836', 'learning_rate': '1.104e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '77.48', 'memory/max_allocated (GiB)': '77.48', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.3', 'tokens/total': 1642690176, 'tokens/trainable': 606981824, 'epoch': '1.482'}
 49%|██████████████████████████████████████████▉                                            | 865/1751 [14:29:19<14:47:04, 60.07s/it] 49%|███████████████████████████████████████████                                            | 866/1751 [14:30:17<14:36:40, 59.44s/it]                                                                                                                                     {'loss': '0.5233', 'grad_norm': '0.1914', 'learning_rate': '1.102e-05', 'ppl': '1.688', 'memory/max_active (GiB)': '74.71', 'memory/max_allocated (GiB)': '74.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107.2', 'tokens/total': 1644513152, 'tokens/trainable': 607647232, 'epoch': '1.483'}
 49%|███████████████████████████████████████████                                            | 866/1751 [14:30:17<14:36:40, 59.44s/it] 50%|███████████████████████████████████████████                                            | 867/1751 [14:31:17<14:39:48, 59.71s/it]                                                                                                                                     {'loss': '0.5037', 'grad_norm': '0.1719', 'learning_rate': '1.1e-05', 'ppl': '1.655', 'memory/max_active (GiB)': '77.89', 'memory/max_allocated (GiB)': '77.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '133.4', 'tokens/total': 1646456064, 'tokens/trainable': 608392064, 'epoch': '1.485'}
 50%|███████████████████████████████████████████                                            | 867/1751 [14:31:17<14:39:48, 59.71s/it] 50%|███████████████████████████████████████████▏                                           | 868/1751 [14:32:18<14:43:01, 60.00s/it]                                                                                                                                     {'loss': '0.5141', 'grad_norm': '0.1826', 'learning_rate': '1.098e-05', 'ppl': '1.672', 'memory/max_active (GiB)': '74.06', 'memory/max_allocated (GiB)': '74.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '96.71', 'tokens/total': 1648392832, 'tokens/trainable': 609092608, 'epoch': '1.487'}
 50%|███████████████████████████████████████████▏                                           | 868/1751 [14:32:18<14:43:01, 60.00s/it] 50%|███████████████████████████████████████████▏                                           | 869/1751 [14:33:18<14:41:03, 59.94s/it]                                                                                                                                     {'loss': '0.539', 'grad_norm': '0.2119', 'learning_rate': '1.096e-05', 'ppl': '1.714', 'memory/max_active (GiB)': '76.15', 'memory/max_allocated (GiB)': '76.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '87.41', 'tokens/total': 1650288512, 'tokens/trainable': 609793984, 'epoch': '1.488'}
 50%|███████████████████████████████████████████▏                                           | 869/1751 [14:33:18<14:41:03, 59.94s/it] 50%|███████████████████████████████████████████▏                                           | 870/1751 [14:34:17<14:39:43, 59.91s/it]                                                                                                                                     {'loss': '0.5181', 'grad_norm': '0.1904', 'learning_rate': '1.094e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '72.45', 'memory/max_allocated (GiB)': '72.45', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '170.1', 'tokens/total': 1652187264, 'tokens/trainable': 610492864, 'epoch': '1.49'}
 50%|███████████████████████████████████████████▏                                           | 870/1751 [14:34:17<14:39:43, 59.91s/it] 50%|███████████████████████████████████████████▎                                           | 871/1751 [14:35:17<14:38:27, 59.90s/it]                                                                                                                                     {'loss': '0.5451', 'grad_norm': '0.1934', 'learning_rate': '1.092e-05', 'ppl': '1.725', 'memory/max_active (GiB)': '75.21', 'memory/max_allocated (GiB)': '75.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '115.6', 'tokens/total': 1654074752, 'tokens/trainable': 611226880, 'epoch': '1.492'}
 50%|███████████████████████████████████████████▎                                           | 871/1751 [14:35:17<14:38:27, 59.90s/it] 50%|███████████████████████████████████████████▎                                           | 872/1751 [14:36:19<14:43:19, 60.30s/it]                                                                                                                                     {'loss': '0.51', 'grad_norm': '0.1807', 'learning_rate': '1.09e-05', 'ppl': '1.665', 'memory/max_active (GiB)': '76.86', 'memory/max_allocated (GiB)': '76.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.4', 'tokens/total': 1656019456, 'tokens/trainable': 611948992, 'epoch': '1.494'}
 50%|███████████████████████████████████████████▎                                           | 872/1751 [14:36:19<14:43:19, 60.30s/it] 50%|███████████████████████████████████████████▍                                           | 873/1751 [14:37:21<14:54:03, 61.10s/it]                                                                                                                                     {'loss': '0.4928', 'grad_norm': '0.1855', 'learning_rate': '1.089e-05', 'ppl': '1.637', 'memory/max_active (GiB)': '74.77', 'memory/max_allocated (GiB)': '74.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.08', 'tokens/total': 1658007040, 'tokens/trainable': 612673792, 'epoch': '1.495'}
 50%|███████████████████████████████████████████▍                                           | 873/1751 [14:37:21<14:54:03, 61.10s/it] 50%|███████████████████████████████████████████▍                                           | 874/1751 [14:38:23<14:54:43, 61.21s/it]                                                                                                                                     {'loss': '0.5331', 'grad_norm': '0.1836', 'learning_rate': '1.087e-05', 'ppl': '1.704', 'memory/max_active (GiB)': '71.93', 'memory/max_allocated (GiB)': '71.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26.1', 'tokens/total': 1659934336, 'tokens/trainable': 613400000, 'epoch': '1.497'}
 50%|███████████████████████████████████████████▍                                           | 874/1751 [14:38:23<14:54:43, 61.21s/it] 50%|███████████████████████████████████████████▍                                           | 875/1751 [14:39:24<14:51:00, 61.03s/it]                                                                                                                                     {'loss': '0.5545', 'grad_norm': '0.2002', 'learning_rate': '1.085e-05', 'ppl': '1.741', 'memory/max_active (GiB)': '74.93', 'memory/max_allocated (GiB)': '74.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.87', 'tokens/total': 1661867136, 'tokens/trainable': 614104384, 'epoch': '1.499'}
 50%|███████████████████████████████████████████▍                                           | 875/1751 [14:39:24<14:51:00, 61.03s/it] 50%|███████████████████████████████████████████▌                                           | 876/1751 [14:40:20<14:31:45, 59.78s/it]                                                                                                                                     {'loss': '0.5061', 'grad_norm': '0.2012', 'learning_rate': '1.083e-05', 'ppl': '1.659', 'memory/max_active (GiB)': '72.75', 'memory/max_allocated (GiB)': '72.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '66.34', 'tokens/total': 1663682688, 'tokens/trainable': 614745408, 'epoch': '1.5'}
 50%|███████████████████████████████████████████▌                                           | 876/1751 [14:40:20<14:31:45, 59.78s/it] 50%|███████████████████████████████████████████▌                                           | 877/1751 [14:41:19<14:26:29, 59.48s/it]                                                                                                                                     {'loss': '0.5275', 'grad_norm': '0.1855', 'learning_rate': '1.081e-05', 'ppl': '1.695', 'memory/max_active (GiB)': '71.26', 'memory/max_allocated (GiB)': '71.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '193.6', 'tokens/total': 1665559936, 'tokens/trainable': 615454464, 'epoch': '1.502'}
 50%|███████████████████████████████████████████▌                                           | 877/1751 [14:41:19<14:26:29, 59.48s/it] 50%|███████████████████████████████████████████▌                                           | 878/1751 [14:42:22<14:38:11, 60.36s/it]                                                                                                                                     {'loss': '0.5005', 'grad_norm': '0.1719', 'learning_rate': '1.079e-05', 'ppl': '1.65', 'memory/max_active (GiB)': '73.2', 'memory/max_allocated (GiB)': '73.2', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '102.6', 'tokens/total': 1667589632, 'tokens/trainable': 616210560, 'epoch': '1.504'}
 50%|███████████████████████████████████████████▌                                           | 878/1751 [14:42:22<14:38:11, 60.36s/it] 50%|███████████████████████████████████████████▋                                           | 879/1751 [14:43:22<14:35:13, 60.22s/it]                                                                                                                                     {'loss': '0.4961', 'grad_norm': '0.1826', 'learning_rate': '1.077e-05', 'ppl': '1.642', 'memory/max_active (GiB)': '75.75', 'memory/max_allocated (GiB)': '75.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.02', 'tokens/total': 1669528064, 'tokens/trainable': 616892608, 'epoch': '1.506'}
 50%|███████████████████████████████████████████▋                                           | 879/1751 [14:43:22<14:35:13, 60.22s/it] 50%|███████████████████████████████████████████▋                                           | 880/1751 [14:44:19<14:23:10, 59.46s/it]                                                                                                                                     {'loss': '0.5473', 'grad_norm': '0.1924', 'learning_rate': '1.075e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '71.87', 'memory/max_allocated (GiB)': '71.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '132.3', 'tokens/total': 1671366400, 'tokens/trainable': 617572288, 'epoch': '1.507'}
 50%|███████████████████████████████████████████▋                                           | 880/1751 [14:44:19<14:23:10, 59.46s/it] 50%|███████████████████████████████████████████▊                                           | 881/1751 [14:45:17<14:16:23, 59.06s/it]                                                                                                                                     {'loss': '0.5204', 'grad_norm': '0.1836', 'learning_rate': '1.074e-05', 'ppl': '1.683', 'memory/max_active (GiB)': '77.1', 'memory/max_allocated (GiB)': '77.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.38', 'tokens/total': 1673241472, 'tokens/trainable': 618259776, 'epoch': '1.509'}
 50%|███████████████████████████████████████████▊                                           | 881/1751 [14:45:17<14:16:23, 59.06s/it] 50%|███████████████████████████████████████████▊                                           | 882/1751 [14:46:18<14:21:20, 59.47s/it]                                                                                                                                     {'loss': '0.4859', 'grad_norm': '0.1836', 'learning_rate': '1.072e-05', 'ppl': '1.626', 'memory/max_active (GiB)': '73.74', 'memory/max_allocated (GiB)': '73.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '163.4', 'tokens/total': 1675160576, 'tokens/trainable': 618958592, 'epoch': '1.511'}
 50%|███████████████████████████████████████████▊                                           | 882/1751 [14:46:18<14:21:20, 59.47s/it] 50%|███████████████████████████████████████████▊                                           | 883/1751 [14:47:21<14:36:25, 60.58s/it]                                                                                                                                     {'loss': '0.5073', 'grad_norm': '0.1807', 'learning_rate': '1.07e-05', 'ppl': '1.661', 'memory/max_active (GiB)': '76.46', 'memory/max_allocated (GiB)': '76.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '131.2', 'tokens/total': 1677195520, 'tokens/trainable': 619737280, 'epoch': '1.512'}
 50%|███████████████████████████████████████████▊                                           | 883/1751 [14:47:21<14:36:25, 60.58s/it] 50%|███████████████████████████████████████████▉                                           | 884/1751 [14:48:23<14:41:39, 61.01s/it]                                                                                                                                     {'loss': '0.5299', 'grad_norm': '0.1836', 'learning_rate': '1.068e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '75.2', 'memory/max_allocated (GiB)': '75.2', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '136.2', 'tokens/total': 1679221760, 'tokens/trainable': 620492352, 'epoch': '1.514'}
 50%|███████████████████████████████████████████▉                                           | 884/1751 [14:48:23<14:41:39, 61.01s/it] 51%|███████████████████████████████████████████▉                                           | 885/1751 [14:49:24<14:39:10, 60.91s/it]                                                                                                                                     {'loss': '0.499', 'grad_norm': '0.1768', 'learning_rate': '1.066e-05', 'ppl': '1.647', 'memory/max_active (GiB)': '76.67', 'memory/max_allocated (GiB)': '76.67', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.5', 'tokens/total': 1681163392, 'tokens/trainable': 621193216, 'epoch': '1.516'}
 51%|███████████████████████████████████████████▉                                           | 885/1751 [14:49:24<14:39:10, 60.91s/it] 51%|████████████████████████████████████████████                                           | 886/1751 [14:50:23<14:31:02, 60.42s/it]                                                                                                                                     {'loss': '0.5154', 'grad_norm': '0.1846', 'learning_rate': '1.064e-05', 'ppl': '1.674', 'memory/max_active (GiB)': '71.28', 'memory/max_allocated (GiB)': '71.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '32.43', 'tokens/total': 1683049984, 'tokens/trainable': 621897088, 'epoch': '1.518'}
 51%|████████████████████████████████████████████                                           | 886/1751 [14:50:23<14:31:02, 60.42s/it] 51%|████████████████████████████████████████████                                           | 887/1751 [14:51:24<14:33:51, 60.69s/it]                                                                                                                                     {'loss': '0.4991', 'grad_norm': '0.1787', 'learning_rate': '1.062e-05', 'ppl': '1.647', 'memory/max_active (GiB)': '74.12', 'memory/max_allocated (GiB)': '74.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.5', 'tokens/total': 1684943744, 'tokens/trainable': 622590976, 'epoch': '1.519'}
 51%|████████████████████████████████████████████                                           | 887/1751 [14:51:24<14:33:51, 60.69s/it] 51%|████████████████████████████████████████████                                           | 888/1751 [14:52:25<14:34:29, 60.80s/it]                                                                                                                                     {'loss': '0.529', 'grad_norm': '0.1768', 'learning_rate': '1.06e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '72.7', 'memory/max_allocated (GiB)': '72.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '171.3', 'tokens/total': 1686847104, 'tokens/trainable': 623346560, 'epoch': '1.521'}
 51%|████████████████████████████████████████████                                           | 888/1751 [14:52:25<14:34:29, 60.80s/it] 51%|████████████████████████████████████████████▏                                          | 889/1751 [14:53:24<14:24:49, 60.20s/it]                                                                                                                                     {'loss': '0.5082', 'grad_norm': '0.1914', 'learning_rate': '1.058e-05', 'ppl': '1.662', 'memory/max_active (GiB)': '74.54', 'memory/max_allocated (GiB)': '74.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.15', 'tokens/total': 1688666880, 'tokens/trainable': 624015936, 'epoch': '1.523'}
 51%|████████████████████████████████████████████▏                                          | 889/1751 [14:53:24<14:24:49, 60.20s/it] 51%|████████████████████████████████████████████▏                                          | 890/1751 [14:54:23<14:17:50, 59.78s/it]                                                                                                                                     {'loss': '0.5428', 'grad_norm': '0.1943', 'learning_rate': '1.057e-05', 'ppl': '1.721', 'memory/max_active (GiB)': '74.36', 'memory/max_allocated (GiB)': '74.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.92', 'tokens/total': 1690529152, 'tokens/trainable': 624716672, 'epoch': '1.524'}
 51%|████████████████████████████████████████████▏                                          | 890/1751 [14:54:23<14:17:50, 59.78s/it] 51%|████████████████████████████████████████████▎                                          | 891/1751 [14:55:20<14:03:36, 58.86s/it]                                                                                                                                     {'loss': '0.5526', 'grad_norm': '0.1934', 'learning_rate': '1.055e-05', 'ppl': '1.738', 'memory/max_active (GiB)': '76.88', 'memory/max_allocated (GiB)': '76.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '16.38', 'tokens/total': 1692272896, 'tokens/trainable': 625378560, 'epoch': '1.526'}
 51%|████████████████████████████████████████████▎                                          | 891/1751 [14:55:20<14:03:36, 58.86s/it] 51%|████████████████████████████████████████████▎                                          | 892/1751 [14:56:19<14:04:56, 59.02s/it]                                                                                                                                     {'loss': '0.5478', 'grad_norm': '0.1885', 'learning_rate': '1.053e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '73.51', 'memory/max_allocated (GiB)': '73.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '89.64', 'tokens/total': 1694140672, 'tokens/trainable': 626063552, 'epoch': '1.528'}
 51%|████████████████████████████████████████████▎                                          | 892/1751 [14:56:19<14:04:56, 59.02s/it] 51%|████████████████████████████████████████████▎                                          | 893/1751 [14:57:18<14:02:59, 58.95s/it]                                                                                                                                     {'loss': '0.5156', 'grad_norm': '0.1816', 'learning_rate': '1.051e-05', 'ppl': '1.675', 'memory/max_active (GiB)': '75.89', 'memory/max_allocated (GiB)': '75.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '44.78', 'tokens/total': 1695996928, 'tokens/trainable': 626768320, 'epoch': '1.529'}
 51%|████████████████████████████████████████████▎                                          | 893/1751 [14:57:18<14:02:59, 58.95s/it] 51%|████████████████████████████████████████████▍                                          | 894/1751 [14:58:18<14:06:40, 59.28s/it]                                                                                                                                     {'loss': '0.5123', 'grad_norm': '0.1826', 'learning_rate': '1.049e-05', 'ppl': '1.669', 'memory/max_active (GiB)': '75.11', 'memory/max_allocated (GiB)': '75.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '125.5', 'tokens/total': 1697932800, 'tokens/trainable': 627480512, 'epoch': '1.531'}
 51%|████████████████████████████████████████████▍                                          | 894/1751 [14:58:18<14:06:40, 59.28s/it] 51%|████████████████████████████████████████████▍                                          | 895/1751 [14:59:18<14:07:49, 59.43s/it]                                                                                                                                     {'loss': '0.5086', 'grad_norm': '0.1777', 'learning_rate': '1.047e-05', 'ppl': '1.663', 'memory/max_active (GiB)': '74.74', 'memory/max_allocated (GiB)': '74.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '152.7', 'tokens/total': 1699790080, 'tokens/trainable': 628167360, 'epoch': '1.533'}
 51%|████████████████████████████████████████████▍                                          | 895/1751 [14:59:18<14:07:49, 59.43s/it] 51%|████████████████████████████████████████████▌                                          | 896/1751 [15:00:15<13:58:39, 58.85s/it]                                                                                                                                     {'loss': '0.544', 'grad_norm': '0.1943', 'learning_rate': '1.045e-05', 'ppl': '1.723', 'memory/max_active (GiB)': '74.23', 'memory/max_allocated (GiB)': '74.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.38', 'tokens/total': 1701597952, 'tokens/trainable': 628821888, 'epoch': '1.535'}
 51%|████████████████████████████████████████████▌                                          | 896/1751 [15:00:15<13:58:39, 58.85s/it] 51%|████████████████████████████████████████████▌                                          | 897/1751 [15:01:13<13:52:45, 58.51s/it]                                                                                                                                     {'loss': '0.5475', 'grad_norm': '0.2041', 'learning_rate': '1.043e-05', 'ppl': '1.729', 'memory/max_active (GiB)': '74.26', 'memory/max_allocated (GiB)': '74.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '204.9', 'tokens/total': 1703362816, 'tokens/trainable': 629423360, 'epoch': '1.536'}
 51%|████████████████████████████████████████████▌                                          | 897/1751 [15:01:13<13:52:45, 58.51s/it] 51%|████████████████████████████████████████████▌                                          | 898/1751 [15:02:13<13:57:29, 58.91s/it]                                                                                                                                     {'loss': '0.5462', 'grad_norm': '0.1943', 'learning_rate': '1.042e-05', 'ppl': '1.727', 'memory/max_active (GiB)': '74.06', 'memory/max_allocated (GiB)': '74.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.11', 'tokens/total': 1705213312, 'tokens/trainable': 630080640, 'epoch': '1.538'}
 51%|████████████████████████████████████████████▌                                          | 898/1751 [15:02:13<13:57:29, 58.91s/it] 51%|████████████████████████████████████████████▋                                          | 899/1751 [15:03:16<14:14:53, 60.20s/it]                                                                                                                                     {'loss': '0.5011', 'grad_norm': '0.1914', 'learning_rate': '1.04e-05', 'ppl': '1.651', 'memory/max_active (GiB)': '72.48', 'memory/max_allocated (GiB)': '72.48', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.36', 'tokens/total': 1707250944, 'tokens/trainable': 630834176, 'epoch': '1.54'}
 51%|████████████████████████████████████████████▋                                          | 899/1751 [15:03:16<14:14:53, 60.20s/it] 51%|████████████████████████████████████████████▋                                          | 900/1751 [15:04:17<14:17:26, 60.45s/it]                                                                                                                                     {'loss': '0.4906', 'grad_norm': '0.1836', 'learning_rate': '1.038e-05', 'ppl': '1.633', 'memory/max_active (GiB)': '72.22', 'memory/max_allocated (GiB)': '72.22', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '191.4', 'tokens/total': 1709190528, 'tokens/trainable': 631545792, 'epoch': '1.541'}
 51%|████████████████████████████████████████████▋                                          | 900/1751 [15:04:17<14:17:26, 60.45s/it] 51%|████████████████████████████████████████████▊                                          | 901/1751 [15:05:16<14:10:36, 60.04s/it]                                                                                                                                     {'loss': '0.5426', 'grad_norm': '0.1875', 'learning_rate': '1.036e-05', 'ppl': '1.72', 'memory/max_active (GiB)': '73.54', 'memory/max_allocated (GiB)': '73.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '24.86', 'tokens/total': 1711001600, 'tokens/trainable': 632219968, 'epoch': '1.543'}
 51%|████████████████████████████████████████████▊                                          | 901/1751 [15:05:16<14:10:36, 60.04s/it] 52%|████████████████████████████████████████████▊                                          | 902/1751 [15:06:15<14:04:00, 59.65s/it]                                                                                                                                     {'loss': '0.5217', 'grad_norm': '0.1875', 'learning_rate': '1.034e-05', 'ppl': '1.685', 'memory/max_active (GiB)': '69.17', 'memory/max_allocated (GiB)': '69.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '89.52', 'tokens/total': 1712861696, 'tokens/trainable': 632911296, 'epoch': '1.545'}
 52%|████████████████████████████████████████████▊                                          | 902/1751 [15:06:15<14:04:00, 59.65s/it] 52%|████████████████████████████████████████████▊                                          | 903/1751 [15:07:16<14:09:16, 60.09s/it]                                                                                                                                     {'loss': '0.5334', 'grad_norm': '0.1943', 'learning_rate': '1.032e-05', 'ppl': '1.705', 'memory/max_active (GiB)': '75.8', 'memory/max_allocated (GiB)': '75.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '39.31', 'tokens/total': 1714788608, 'tokens/trainable': 633634560, 'epoch': '1.547'}
 52%|████████████████████████████████████████████▊                                          | 903/1751 [15:07:16<14:09:16, 60.09s/it] 52%|████████████████████████████████████████████▉                                          | 904/1751 [15:08:16<14:07:47, 60.06s/it]                                                                                                                                     {'loss': '0.5103', 'grad_norm': '0.1826', 'learning_rate': '1.03e-05', 'ppl': '1.666', 'memory/max_active (GiB)': '75.05', 'memory/max_allocated (GiB)': '75.05', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '76.42', 'tokens/total': 1716689024, 'tokens/trainable': 634327296, 'epoch': '1.548'}
 52%|████████████████████████████████████████████▉                                          | 904/1751 [15:08:16<14:07:47, 60.06s/it] 52%|████████████████████████████████████████████▉                                          | 905/1751 [15:09:14<13:59:47, 59.56s/it]                                                                                                                                     {'loss': '0.5003', 'grad_norm': '0.1875', 'learning_rate': '1.028e-05', 'ppl': '1.649', 'memory/max_active (GiB)': '76.39', 'memory/max_allocated (GiB)': '76.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '117.8', 'tokens/total': 1718498304, 'tokens/trainable': 635005824, 'epoch': '1.55'}
 52%|████████████████████████████████████████████▉                                          | 905/1751 [15:09:14<13:59:47, 59.56s/it] 52%|█████████████████████████████████████████████                                          | 906/1751 [15:10:14<13:58:53, 59.57s/it]                                                                                                                                     {'loss': '0.5302', 'grad_norm': '0.1934', 'learning_rate': '1.026e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '73.84', 'memory/max_allocated (GiB)': '73.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.61', 'tokens/total': 1720357248, 'tokens/trainable': 635690368, 'epoch': '1.552'}
 52%|█████████████████████████████████████████████                                          | 906/1751 [15:10:14<13:58:53, 59.57s/it] 52%|█████████████████████████████████████████████                                          | 907/1751 [15:11:11<13:46:32, 58.76s/it]                                                                                                                                     {'loss': '0.5449', 'grad_norm': '0.2021', 'learning_rate': '1.025e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '73.96', 'memory/max_allocated (GiB)': '73.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '89.18', 'tokens/total': 1722122496, 'tokens/trainable': 636316800, 'epoch': '1.553'}
 52%|█████████████████████████████████████████████                                          | 907/1751 [15:11:11<13:46:32, 58.76s/it] 52%|█████████████████████████████████████████████                                          | 908/1751 [15:12:11<13:53:25, 59.32s/it]                                                                                                                                     {'loss': '0.4917', 'grad_norm': '0.1914', 'learning_rate': '1.023e-05', 'ppl': '1.635', 'memory/max_active (GiB)': '76.07', 'memory/max_allocated (GiB)': '76.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.75', 'tokens/total': 1724049024, 'tokens/trainable': 637042432, 'epoch': '1.555'}
 52%|█████████████████████████████████████████████                                          | 908/1751 [15:12:11<13:53:25, 59.32s/it] 52%|█████████████████████████████████████████████▏                                         | 909/1751 [15:13:11<13:52:35, 59.33s/it]                                                                                                                                     {'loss': '0.5589', 'grad_norm': '0.1953', 'learning_rate': '1.021e-05', 'ppl': '1.749', 'memory/max_active (GiB)': '73.53', 'memory/max_allocated (GiB)': '73.53', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '156.5', 'tokens/total': 1725851136, 'tokens/trainable': 637724096, 'epoch': '1.557'}
 52%|█████████████████████████████████████████████▏                                         | 909/1751 [15:13:11<13:52:35, 59.33s/it] 52%|█████████████████████████████████████████████▏                                         | 910/1751 [15:14:14<14:06:55, 60.42s/it]                                                                                                                                     {'loss': '0.4719', 'grad_norm': '0.1836', 'learning_rate': '1.019e-05', 'ppl': '1.603', 'memory/max_active (GiB)': '72.41', 'memory/max_allocated (GiB)': '72.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '77.72', 'tokens/total': 1727869184, 'tokens/trainable': 638453888, 'epoch': '1.559'}
 52%|█████████████████████████████████████████████▏                                         | 910/1751 [15:14:14<14:06:55, 60.42s/it] 52%|█████████████████████████████████████████████▎                                         | 911/1751 [15:15:16<14:15:15, 61.09s/it]                                                                                                                                     {'loss': '0.4652', 'grad_norm': '0.1885', 'learning_rate': '1.017e-05', 'ppl': '1.592', 'memory/max_active (GiB)': '73.86', 'memory/max_allocated (GiB)': '73.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '97.5', 'tokens/total': 1729869824, 'tokens/trainable': 639174272, 'epoch': '1.56'}
 52%|█████████████████████████████████████████████▎                                         | 911/1751 [15:15:16<14:15:15, 61.09s/it] 52%|█████████████████████████████████████████████▎                                         | 912/1751 [15:16:17<14:11:08, 60.87s/it]                                                                                                                                     {'loss': '0.5304', 'grad_norm': '0.1914', 'learning_rate': '1.015e-05', 'ppl': '1.7', 'memory/max_active (GiB)': '69.67', 'memory/max_allocated (GiB)': '69.67', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.98', 'tokens/total': 1731776128, 'tokens/trainable': 639877696, 'epoch': '1.562'}
 52%|█████████████████████████████████████████████▎                                         | 912/1751 [15:16:17<14:11:08, 60.87s/it] 52%|█████████████████████████████████████████████▎                                         | 913/1751 [15:17:16<14:04:58, 60.50s/it]                                                                                                                                     {'loss': '0.5386', 'grad_norm': '0.1875', 'learning_rate': '1.013e-05', 'ppl': '1.714', 'memory/max_active (GiB)': '73.17', 'memory/max_allocated (GiB)': '73.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.51', 'tokens/total': 1733632512, 'tokens/trainable': 640562368, 'epoch': '1.564'}
 52%|█████████████████████████████████████████████▎                                         | 913/1751 [15:17:16<14:04:58, 60.50s/it] 52%|█████████████████████████████████████████████▍                                         | 914/1751 [15:18:16<14:00:15, 60.23s/it]                                                                                                                                     {'loss': '0.5015', 'grad_norm': '0.1904', 'learning_rate': '1.011e-05', 'ppl': '1.651', 'memory/max_active (GiB)': '74.46', 'memory/max_allocated (GiB)': '74.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '28.39', 'tokens/total': 1735524096, 'tokens/trainable': 641236672, 'epoch': '1.565'}
 52%|█████████████████████████████████████████████▍                                         | 914/1751 [15:18:16<14:00:15, 60.23s/it] 52%|█████████████████████████████████████████████▍                                         | 915/1751 [15:19:16<13:59:17, 60.24s/it]                                                                                                                                     {'loss': '0.5182', 'grad_norm': '0.1846', 'learning_rate': '1.009e-05', 'ppl': '1.679', 'memory/max_active (GiB)': '74.19', 'memory/max_allocated (GiB)': '74.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.9', 'tokens/total': 1737365632, 'tokens/trainable': 641900416, 'epoch': '1.567'}
 52%|█████████████████████████████████████████████▍                                         | 915/1751 [15:19:16<13:59:17, 60.24s/it] 52%|█████████████████████████████████████████████▌                                         | 916/1751 [15:20:17<13:59:42, 60.34s/it]                                                                                                                                     {'loss': '0.5287', 'grad_norm': '0.1797', 'learning_rate': '1.008e-05', 'ppl': '1.697', 'memory/max_active (GiB)': '73.64', 'memory/max_allocated (GiB)': '73.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.02', 'tokens/total': 1739258496, 'tokens/trainable': 642616512, 'epoch': '1.569'}
 52%|█████████████████████████████████████████████▌                                         | 916/1751 [15:20:17<13:59:42, 60.34s/it] 52%|█████████████████████████████████████████████▌                                         | 917/1751 [15:21:17<14:00:37, 60.48s/it]                                                                                                                                     {'loss': '0.5247', 'grad_norm': '0.1953', 'learning_rate': '1.006e-05', 'ppl': '1.69', 'memory/max_active (GiB)': '74.49', 'memory/max_allocated (GiB)': '74.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '29.83', 'tokens/total': 1741115136, 'tokens/trainable': 643316160, 'epoch': '1.571'}
 52%|█████████████████████████████████████████████▌                                         | 917/1751 [15:21:17<14:00:37, 60.48s/it] 52%|█████████████████████████████████████████████▌                                         | 918/1751 [15:22:16<13:53:18, 60.02s/it]                                                                                                                                     {'loss': '0.5447', 'grad_norm': '0.1953', 'learning_rate': '1.004e-05', 'ppl': '1.724', 'memory/max_active (GiB)': '73.3', 'memory/max_allocated (GiB)': '73.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '48.73', 'tokens/total': 1742922496, 'tokens/trainable': 643967552, 'epoch': '1.572'}
 52%|█████████████████████████████████████████████▌                                         | 918/1751 [15:22:16<13:53:18, 60.02s/it] 52%|█████████████████████████████████████████████▋                                         | 919/1751 [15:23:17<13:55:49, 60.28s/it]                                                                                                                                     {'loss': '0.4978', 'grad_norm': '0.1768', 'learning_rate': '1.002e-05', 'ppl': '1.645', 'memory/max_active (GiB)': '74.15', 'memory/max_allocated (GiB)': '74.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.97', 'tokens/total': 1744795648, 'tokens/trainable': 644653632, 'epoch': '1.574'}
 52%|█████████████████████████████████████████████▋                                         | 919/1751 [15:23:17<13:55:49, 60.28s/it] 53%|█████████████████████████████████████████████▋                                         | 920/1751 [15:24:18<13:56:09, 60.37s/it]                                                                                                                                     {'loss': '0.5204', 'grad_norm': '0.1797', 'learning_rate': '1e-05', 'ppl': '1.683', 'memory/max_active (GiB)': '75.63', 'memory/max_allocated (GiB)': '75.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '67.37', 'tokens/total': 1746691072, 'tokens/trainable': 645375360, 'epoch': '1.576'}
 53%|█████████████████████████████████████████████▋                                         | 920/1751 [15:24:18<13:56:09, 60.37s/it] 53%|█████████████████████████████████████████████▊                                         | 921/1751 [15:25:19<13:57:53, 60.57s/it]                                                                                                                                     {'loss': '0.5285', 'grad_norm': '0.1855', 'learning_rate': '9.981e-06', 'ppl': '1.696', 'memory/max_active (GiB)': '78.54', 'memory/max_allocated (GiB)': '78.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.87', 'tokens/total': 1748612736, 'tokens/trainable': 646105472, 'epoch': '1.577'}
 53%|█████████████████████████████████████████████▊                                         | 921/1751 [15:25:19<13:57:53, 60.57s/it] 53%|█████████████████████████████████████████████▊                                         | 922/1751 [15:26:19<13:55:09, 60.45s/it]                                                                                                                                     {'loss': '0.4826', 'grad_norm': '0.1768', 'learning_rate': '9.962e-06', 'ppl': '1.62', 'memory/max_active (GiB)': '76.46', 'memory/max_allocated (GiB)': '76.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '63.77', 'tokens/total': 1750537472, 'tokens/trainable': 646804032, 'epoch': '1.579'}
 53%|█████████████████████████████████████████████▊                                         | 922/1751 [15:26:19<13:55:09, 60.45s/it] 53%|█████████████████████████████████████████████▊                                         | 923/1751 [15:27:15<13:37:07, 59.21s/it]                                                                                                                                     {'loss': '0.549', 'grad_norm': '0.1982', 'learning_rate': '9.943e-06', 'ppl': '1.731', 'memory/max_active (GiB)': '70.38', 'memory/max_allocated (GiB)': '70.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '42.62', 'tokens/total': 1752269184, 'tokens/trainable': 647431680, 'epoch': '1.581'}
 53%|█████████████████████████████████████████████▊                                         | 923/1751 [15:27:15<13:37:07, 59.21s/it] 53%|█████████████████████████████████████████████▉                                         | 924/1751 [15:28:15<13:38:31, 59.39s/it]                                                                                                                                     {'loss': '0.52', 'grad_norm': '0.1787', 'learning_rate': '9.924e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '73.96', 'memory/max_allocated (GiB)': '73.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '63.32', 'tokens/total': 1754157952, 'tokens/trainable': 648145728, 'epoch': '1.583'}
 53%|█████████████████████████████████████████████▉                                         | 924/1751 [15:28:15<13:38:31, 59.39s/it] 53%|█████████████████████████████████████████████▉                                         | 925/1751 [15:29:18<13:49:47, 60.28s/it]                                                                                                                                     {'loss': '0.5315', 'grad_norm': '0.1777', 'learning_rate': '9.906e-06', 'ppl': '1.701', 'memory/max_active (GiB)': '76.82', 'memory/max_allocated (GiB)': '76.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.41', 'tokens/total': 1756114816, 'tokens/trainable': 648878976, 'epoch': '1.584'}
 53%|█████████████████████████████████████████████▉                                         | 925/1751 [15:29:18<13:49:47, 60.28s/it] 53%|██████████████████████████████████████████████                                         | 926/1751 [15:30:18<13:49:50, 60.35s/it]                                                                                                                                     {'loss': '0.4928', 'grad_norm': '0.1748', 'learning_rate': '9.887e-06', 'ppl': '1.637', 'memory/max_active (GiB)': '74.6', 'memory/max_allocated (GiB)': '74.6', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '57.09', 'tokens/total': 1758027264, 'tokens/trainable': 649610944, 'epoch': '1.586'}
 53%|██████████████████████████████████████████████                                         | 926/1751 [15:30:18<13:49:50, 60.35s/it] 53%|██████████████████████████████████████████████                                         | 927/1751 [15:31:20<13:55:08, 60.81s/it]                                                                                                                                     {'loss': '0.5292', 'grad_norm': '0.1807', 'learning_rate': '9.868e-06', 'ppl': '1.698', 'memory/max_active (GiB)': '75.14', 'memory/max_allocated (GiB)': '75.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.87', 'tokens/total': 1760012800, 'tokens/trainable': 650362304, 'epoch': '1.588'}
 53%|██████████████████████████████████████████████                                         | 927/1751 [15:31:20<13:55:08, 60.81s/it] 53%|██████████████████████████████████████████████                                         | 928/1751 [15:32:24<14:05:42, 61.66s/it]                                                                                                                                     {'loss': '0.4913', 'grad_norm': '0.1699', 'learning_rate': '9.849e-06', 'ppl': '1.634', 'memory/max_active (GiB)': '71.87', 'memory/max_allocated (GiB)': '71.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '145.4', 'tokens/total': 1762053760, 'tokens/trainable': 651139776, 'epoch': '1.589'}
 53%|██████████████████████████████████████████████                                         | 928/1751 [15:32:24<14:05:42, 61.66s/it] 53%|██████████████████████████████████████████████▏                                        | 929/1751 [15:33:24<13:59:50, 61.30s/it]                                                                                                                                     {'loss': '0.5078', 'grad_norm': '0.1904', 'learning_rate': '9.83e-06', 'ppl': '1.662', 'memory/max_active (GiB)': '74.22', 'memory/max_allocated (GiB)': '74.22', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '33.06', 'tokens/total': 1763973632, 'tokens/trainable': 651849472, 'epoch': '1.591'}
 53%|██████████████████████████████████████████████▏                                        | 929/1751 [15:33:24<13:59:50, 61.30s/it] 53%|██████████████████████████████████████████████▏                                        | 930/1751 [15:34:26<13:59:50, 61.38s/it]                                                                                                                                     {'loss': '0.5477', 'grad_norm': '0.1875', 'learning_rate': '9.811e-06', 'ppl': '1.729', 'memory/max_active (GiB)': '76.43', 'memory/max_allocated (GiB)': '76.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.61', 'tokens/total': 1765900800, 'tokens/trainable': 652582976, 'epoch': '1.593'}
 53%|██████████████████████████████████████████████▏                                        | 930/1751 [15:34:26<13:59:50, 61.38s/it] 53%|██████████████████████████████████████████████▎                                        | 931/1751 [15:35:24<13:47:08, 60.52s/it]                                                                                                                                     {'loss': '0.554', 'grad_norm': '0.1914', 'learning_rate': '9.792e-06', 'ppl': '1.74', 'memory/max_active (GiB)': '76.35', 'memory/max_allocated (GiB)': '76.35', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.13', 'tokens/total': 1767782016, 'tokens/trainable': 653280896, 'epoch': '1.595'}
 53%|██████████████████████████████████████████████▎                                        | 931/1751 [15:35:24<13:47:08, 60.52s/it] 53%|██████████████████████████████████████████████▎                                        | 932/1751 [15:36:26<13:49:56, 60.80s/it]                                                                                                                                     {'loss': '0.5454', 'grad_norm': '0.1768', 'learning_rate': '9.773e-06', 'ppl': '1.725', 'memory/max_active (GiB)': '70.22', 'memory/max_allocated (GiB)': '70.22', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '78.01', 'tokens/total': 1769729664, 'tokens/trainable': 654008256, 'epoch': '1.596'}
 53%|██████████████████████████████████████████████▎                                        | 932/1751 [15:36:26<13:49:56, 60.80s/it] 53%|██████████████████████████████████████████████▎                                        | 933/1751 [15:37:26<13:47:49, 60.72s/it]                                                                                                                                     {'loss': '0.4953', 'grad_norm': '0.1855', 'learning_rate': '9.755e-06', 'ppl': '1.641', 'memory/max_active (GiB)': '73.99', 'memory/max_allocated (GiB)': '73.99', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '30.09', 'tokens/total': 1771637632, 'tokens/trainable': 654703488, 'epoch': '1.598'}
 53%|██████████████████████████████████████████████▎                                        | 933/1751 [15:37:26<13:47:49, 60.72s/it] 53%|██████████████████████████████████████████████▍                                        | 934/1751 [15:38:28<13:49:23, 60.91s/it]                                                                                                                                     {'loss': '0.5237', 'grad_norm': '0.1836', 'learning_rate': '9.736e-06', 'ppl': '1.688', 'memory/max_active (GiB)': '76.68', 'memory/max_allocated (GiB)': '76.68', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '58.78', 'tokens/total': 1773588352, 'tokens/trainable': 655415872, 'epoch': '1.6'}
 53%|██████████████████████████████████████████████▍                                        | 934/1751 [15:38:28<13:49:23, 60.91s/it] 53%|██████████████████████████████████████████████▍                                        | 935/1751 [15:39:25<13:35:35, 59.97s/it]                                                                                                                                     {'loss': '0.5307', 'grad_norm': '0.1943', 'learning_rate': '9.717e-06', 'ppl': '1.7', 'memory/max_active (GiB)': '73.34', 'memory/max_allocated (GiB)': '73.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '143.8', 'tokens/total': 1775434752, 'tokens/trainable': 656099456, 'epoch': '1.601'}
 53%|██████████████████████████████████████████████▍                                        | 935/1751 [15:39:25<13:35:35, 59.97s/it] 53%|██████████████████████████████████████████████▌                                        | 936/1751 [15:40:25<13:34:08, 59.94s/it]                                                                                                                                     {'loss': '0.4923', 'grad_norm': '0.1816', 'learning_rate': '9.698e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '76.3', 'memory/max_allocated (GiB)': '76.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '24.73', 'tokens/total': 1777355136, 'tokens/trainable': 656792448, 'epoch': '1.603'}
 53%|██████████████████████████████████████████████▌                                        | 936/1751 [15:40:25<13:34:08, 59.94s/it] 54%|██████████████████████████████████████████████▌                                        | 937/1751 [15:41:24<13:28:31, 59.60s/it]                                                                                                                                     {'loss': '0.5268', 'grad_norm': '0.1904', 'learning_rate': '9.679e-06', 'ppl': '1.694', 'memory/max_active (GiB)': '74.59', 'memory/max_allocated (GiB)': '74.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '122', 'tokens/total': 1779187200, 'tokens/trainable': 657471296, 'epoch': '1.605'}
 54%|██████████████████████████████████████████████▌                                        | 937/1751 [15:41:24<13:28:31, 59.60s/it] 54%|██████████████████████████████████████████████▌                                        | 938/1751 [15:42:24<13:28:39, 59.68s/it]                                                                                                                                     {'loss': '0.5045', 'grad_norm': '0.1768', 'learning_rate': '9.66e-06', 'ppl': '1.656', 'memory/max_active (GiB)': '75.53', 'memory/max_allocated (GiB)': '75.53', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.46', 'tokens/total': 1781052160, 'tokens/trainable': 658152704, 'epoch': '1.607'}
 54%|██████████████████████████████████████████████▌                                        | 938/1751 [15:42:24<13:28:39, 59.68s/it] 54%|██████████████████████████████████████████████▋                                        | 939/1751 [15:43:22<13:22:37, 59.31s/it]                                                                                                                                     {'loss': '0.5172', 'grad_norm': '0.1855', 'learning_rate': '9.641e-06', 'ppl': '1.677', 'memory/max_active (GiB)': '66.21', 'memory/max_allocated (GiB)': '66.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '123.5', 'tokens/total': 1782897920, 'tokens/trainable': 658809408, 'epoch': '1.608'}
 54%|██████████████████████████████████████████████▋                                        | 939/1751 [15:43:22<13:22:37, 59.31s/it] 54%|██████████████████████████████████████████████▋                                        | 940/1751 [15:44:20<13:15:37, 58.86s/it]                                                                                                                                     {'loss': '0.5737', 'grad_norm': '0.1934', 'learning_rate': '9.622e-06', 'ppl': '1.775', 'memory/max_active (GiB)': '74.3', 'memory/max_allocated (GiB)': '74.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '100.9', 'tokens/total': 1784718208, 'tokens/trainable': 659475264, 'epoch': '1.61'}
 54%|██████████████████████████████████████████████▋                                        | 940/1751 [15:44:20<13:15:37, 58.86s/it] 54%|██████████████████████████████████████████████▊                                        | 941/1751 [15:45:23<13:29:09, 59.94s/it]                                                                                                                                     {'loss': '0.5097', 'grad_norm': '0.1758', 'learning_rate': '9.604e-06', 'ppl': '1.665', 'memory/max_active (GiB)': '75.91', 'memory/max_allocated (GiB)': '75.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '176.7', 'tokens/total': 1786701056, 'tokens/trainable': 660206272, 'epoch': '1.612'}
 54%|██████████████████████████████████████████████▊                                        | 941/1751 [15:45:23<13:29:09, 59.94s/it] 54%|██████████████████████████████████████████████▊                                        | 942/1751 [15:46:21<13:20:33, 59.37s/it]                                                                                                                                     {'loss': '0.5582', 'grad_norm': '0.2051', 'learning_rate': '9.585e-06', 'ppl': '1.748', 'memory/max_active (GiB)': '75.92', 'memory/max_allocated (GiB)': '75.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.98', 'tokens/total': 1788557184, 'tokens/trainable': 660885952, 'epoch': '1.613'}
 54%|██████████████████████████████████████████████▊                                        | 942/1751 [15:46:21<13:20:33, 59.37s/it] 54%|██████████████████████████████████████████████▊                                        | 943/1751 [15:47:19<13:16:53, 59.18s/it]                                                                                                                                     {'loss': '0.5305', 'grad_norm': '0.1943', 'learning_rate': '9.566e-06', 'ppl': '1.7', 'memory/max_active (GiB)': '76.42', 'memory/max_allocated (GiB)': '76.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '143.2', 'tokens/total': 1790414592, 'tokens/trainable': 661572928, 'epoch': '1.615'}
 54%|██████████████████████████████████████████████▊                                        | 943/1751 [15:47:19<13:16:53, 59.18s/it] 54%|██████████████████████████████████████████████▉                                        | 944/1751 [15:48:21<13:27:08, 60.01s/it]                                                                                                                                     {'loss': '0.4822', 'grad_norm': '0.1807', 'learning_rate': '9.547e-06', 'ppl': '1.62', 'memory/max_active (GiB)': '77.16', 'memory/max_allocated (GiB)': '77.16', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.71', 'tokens/total': 1792400000, 'tokens/trainable': 662312320, 'epoch': '1.617'}
 54%|██████████████████████████████████████████████▉                                        | 944/1751 [15:48:21<13:27:08, 60.01s/it] 54%|██████████████████████████████████████████████▉                                        | 945/1751 [15:49:23<13:32:12, 60.46s/it]                                                                                                                                     {'loss': '0.5137', 'grad_norm': '0.1816', 'learning_rate': '9.528e-06', 'ppl': '1.671', 'memory/max_active (GiB)': '76.7', 'memory/max_allocated (GiB)': '76.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '50.84', 'tokens/total': 1794367488, 'tokens/trainable': 663029248, 'epoch': '1.619'}
 54%|██████████████████████████████████████████████▉                                        | 945/1751 [15:49:23<13:32:12, 60.46s/it] 54%|███████████████████████████████████████████████                                        | 946/1751 [15:50:25<13:37:11, 60.91s/it]                                                                                                                                     {'loss': '0.5303', 'grad_norm': '0.1875', 'learning_rate': '9.509e-06', 'ppl': '1.699', 'memory/max_active (GiB)': '73.64', 'memory/max_allocated (GiB)': '73.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '55.72', 'tokens/total': 1796289536, 'tokens/trainable': 663748416, 'epoch': '1.62'}
 54%|███████████████████████████████████████████████                                        | 946/1751 [15:50:25<13:37:11, 60.91s/it] 54%|███████████████████████████████████████████████                                        | 947/1751 [15:51:25<13:31:40, 60.57s/it]                                                                                                                                     {'loss': '0.5326', 'grad_norm': '0.1875', 'learning_rate': '9.49e-06', 'ppl': '1.703', 'memory/max_active (GiB)': '68.23', 'memory/max_allocated (GiB)': '68.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '63.43', 'tokens/total': 1798151936, 'tokens/trainable': 664437056, 'epoch': '1.622'}
 54%|███████████████████████████████████████████████                                        | 947/1751 [15:51:25<13:31:40, 60.57s/it] 54%|███████████████████████████████████████████████                                        | 948/1751 [15:52:23<13:23:17, 60.02s/it]                                                                                                                                     {'loss': '0.5014', 'grad_norm': '0.1875', 'learning_rate': '9.472e-06', 'ppl': '1.651', 'memory/max_active (GiB)': '77.47', 'memory/max_allocated (GiB)': '77.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '54.1', 'tokens/total': 1799976448, 'tokens/trainable': 665084416, 'epoch': '1.624'}
 54%|███████████████████████████████████████████████                                        | 948/1751 [15:52:23<13:23:17, 60.02s/it] 54%|███████████████████████████████████████████████▏                                       | 949/1751 [15:53:23<13:22:04, 60.01s/it]                                                                                                                                     {'loss': '0.55', 'grad_norm': '0.1865', 'learning_rate': '9.453e-06', 'ppl': '1.733', 'memory/max_active (GiB)': '74.39', 'memory/max_allocated (GiB)': '74.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '148.3', 'tokens/total': 1801838336, 'tokens/trainable': 665800320, 'epoch': '1.625'}
 54%|███████████████████████████████████████████████▏                                       | 949/1751 [15:53:23<13:22:04, 60.01s/it] 54%|███████████████████████████████████████████████▏                                       | 950/1751 [15:54:24<13:24:38, 60.27s/it]                                                                                                                                     {'loss': '0.5256', 'grad_norm': '0.1914', 'learning_rate': '9.434e-06', 'ppl': '1.691', 'memory/max_active (GiB)': '72.41', 'memory/max_allocated (GiB)': '72.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '62.56', 'tokens/total': 1803779840, 'tokens/trainable': 666509376, 'epoch': '1.627'}
 54%|███████████████████████████████████████████████▏                                       | 950/1751 [15:54:24<13:24:38, 60.27s/it] 54%|███████████████████████████████████████████████▎                                       | 951/1751 [15:55:22<13:14:07, 59.56s/it]                                                                                                                                     {'loss': '0.5274', 'grad_norm': '0.1924', 'learning_rate': '9.415e-06', 'ppl': '1.695', 'memory/max_active (GiB)': '72.39', 'memory/max_allocated (GiB)': '72.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '138.1', 'tokens/total': 1805562496, 'tokens/trainable': 667163136, 'epoch': '1.629'}
 54%|███████████████████████████████████████████████▎                                       | 951/1751 [15:55:22<13:14:07, 59.56s/it] 54%|███████████████████████████████████████████████▎                                       | 952/1751 [15:56:20<13:08:28, 59.21s/it]                                                                                                                                     {'loss': '0.512', 'grad_norm': '0.1826', 'learning_rate': '9.396e-06', 'ppl': '1.669', 'memory/max_active (GiB)': '77.04', 'memory/max_allocated (GiB)': '77.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.2', 'tokens/total': 1807380352, 'tokens/trainable': 667863552, 'epoch': '1.631'}
 54%|███████████████████████████████████████████████▎                                       | 952/1751 [15:56:20<13:08:28, 59.21s/it] 54%|███████████████████████████████████████████████▎                                       | 953/1751 [15:57:22<13:16:13, 59.87s/it]                                                                                                                                     {'loss': '0.5092', 'grad_norm': '0.1797', 'learning_rate': '9.377e-06', 'ppl': '1.664', 'memory/max_active (GiB)': '74.36', 'memory/max_allocated (GiB)': '74.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.63', 'tokens/total': 1809341952, 'tokens/trainable': 668578176, 'epoch': '1.632'}
 54%|███████████████████████████████████████████████▎                                       | 953/1751 [15:57:22<13:16:13, 59.87s/it] 54%|███████████████████████████████████████████████▍                                       | 954/1751 [15:58:20<13:08:12, 59.34s/it]                                                                                                                                     {'loss': '0.5376', 'grad_norm': '0.1943', 'learning_rate': '9.359e-06', 'ppl': '1.712', 'memory/max_active (GiB)': '66.75', 'memory/max_allocated (GiB)': '66.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '156.2', 'tokens/total': 1811171072, 'tokens/trainable': 669275712, 'epoch': '1.634'}
 54%|███████████████████████████████████████████████▍                                       | 954/1751 [15:58:20<13:08:12, 59.34s/it] 55%|███████████████████████████████████████████████▍                                       | 955/1751 [15:59:19<13:04:15, 59.11s/it]                                                                                                                                     {'loss': '0.5214', 'grad_norm': '0.1904', 'learning_rate': '9.34e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '68.3', 'memory/max_allocated (GiB)': '68.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '19.91', 'tokens/total': 1812997248, 'tokens/trainable': 669942336, 'epoch': '1.636'}
 55%|███████████████████████████████████████████████▍                                       | 955/1751 [15:59:19<13:04:15, 59.11s/it] 55%|███████████████████████████████████████████████▍                                       | 956/1751 [16:00:18<13:06:03, 59.32s/it]                                                                                                                                     {'loss': '0.5001', 'grad_norm': '0.1699', 'learning_rate': '9.321e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '72.5', 'memory/max_allocated (GiB)': '72.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '153.6', 'tokens/total': 1814899456, 'tokens/trainable': 670637056, 'epoch': '1.637'}
 55%|███████████████████████████████████████████████▍                                       | 956/1751 [16:00:18<13:06:03, 59.32s/it] 55%|███████████████████████████████████████████████▌                                       | 957/1751 [16:01:19<13:12:16, 59.87s/it]                                                                                                                                     {'loss': '0.5097', 'grad_norm': '0.1777', 'learning_rate': '9.302e-06', 'ppl': '1.665', 'memory/max_active (GiB)': '75.54', 'memory/max_allocated (GiB)': '75.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '158', 'tokens/total': 1816860800, 'tokens/trainable': 671332672, 'epoch': '1.639'}
 55%|███████████████████████████████████████████████▌                                       | 957/1751 [16:01:19<13:12:16, 59.87s/it] 55%|███████████████████████████████████████████████▌                                       | 958/1751 [16:02:21<13:16:44, 60.28s/it]                                                                                                                                     {'loss': '0.536', 'grad_norm': '0.1953', 'learning_rate': '9.283e-06', 'ppl': '1.709', 'memory/max_active (GiB)': '73.87', 'memory/max_allocated (GiB)': '73.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '96.68', 'tokens/total': 1818795904, 'tokens/trainable': 672027712, 'epoch': '1.641'}
 55%|███████████████████████████████████████████████▌                                       | 958/1751 [16:02:21<13:16:44, 60.28s/it] 55%|███████████████████████████████████████████████▋                                       | 959/1751 [16:03:19<13:09:37, 59.82s/it]                                                                                                                                     {'loss': '0.5228', 'grad_norm': '0.1973', 'learning_rate': '9.264e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '73.42', 'memory/max_allocated (GiB)': '73.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '89.25', 'tokens/total': 1820623232, 'tokens/trainable': 672714944, 'epoch': '1.643'}
 55%|███████████████████████████████████████████████▋                                       | 959/1751 [16:03:19<13:09:37, 59.82s/it] 55%|███████████████████████████████████████████████▋                                       | 960/1751 [16:04:21<13:14:40, 60.28s/it]                                                                                                                                     {'loss': '0.5122', 'grad_norm': '0.1719', 'learning_rate': '9.246e-06', 'ppl': '1.669', 'memory/max_active (GiB)': '77.05', 'memory/max_allocated (GiB)': '77.05', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.36', 'tokens/total': 1822570752, 'tokens/trainable': 673472896, 'epoch': '1.644'}
 55%|███████████████████████████████████████████████▋                                       | 960/1751 [16:04:21<13:14:40, 60.28s/it] 55%|███████████████████████████████████████████████▋                                       | 961/1751 [16:05:24<13:26:08, 61.23s/it]                                                                                                                                     {'loss': '0.5119', 'grad_norm': '0.1758', 'learning_rate': '9.227e-06', 'ppl': '1.668', 'memory/max_active (GiB)': '74.61', 'memory/max_allocated (GiB)': '74.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '194.2', 'tokens/total': 1824637824, 'tokens/trainable': 674230784, 'epoch': '1.646'}
 55%|███████████████████████████████████████████████▋                                       | 961/1751 [16:05:24<13:26:08, 61.23s/it] 55%|███████████████████████████████████████████████▊                                       | 962/1751 [16:06:24<13:20:04, 60.84s/it]                                                                                                                                     {'loss': '0.5681', 'grad_norm': '0.1924', 'learning_rate': '9.208e-06', 'ppl': '1.765', 'memory/max_active (GiB)': '73.92', 'memory/max_allocated (GiB)': '73.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '21.83', 'tokens/total': 1826481536, 'tokens/trainable': 674912576, 'epoch': '1.648'}
 55%|███████████████████████████████████████████████▊                                       | 962/1751 [16:06:24<13:20:04, 60.84s/it] 55%|███████████████████████████████████████████████▊                                       | 963/1751 [16:07:23<13:10:27, 60.19s/it]                                                                                                                                     {'loss': '0.5378', 'grad_norm': '0.1914', 'learning_rate': '9.189e-06', 'ppl': '1.712', 'memory/max_active (GiB)': '70.55', 'memory/max_allocated (GiB)': '70.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '22.88', 'tokens/total': 1828321664, 'tokens/trainable': 675575424, 'epoch': '1.649'}
 55%|███████████████████████████████████████████████▊                                       | 963/1751 [16:07:23<13:10:27, 60.19s/it] 55%|███████████████████████████████████████████████▉                                       | 964/1751 [16:08:23<13:10:07, 60.24s/it]                                                                                                                                     {'loss': '0.5161', 'grad_norm': '0.1865', 'learning_rate': '9.17e-06', 'ppl': '1.675', 'memory/max_active (GiB)': '74.39', 'memory/max_allocated (GiB)': '74.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '34.69', 'tokens/total': 1830189056, 'tokens/trainable': 676247424, 'epoch': '1.651'}
 55%|███████████████████████████████████████████████▉                                       | 964/1751 [16:08:23<13:10:07, 60.24s/it] 55%|███████████████████████████████████████████████▉                                       | 965/1751 [16:09:24<13:10:50, 60.37s/it]                                                                                                                                     {'loss': '0.5176', 'grad_norm': '0.1758', 'learning_rate': '9.151e-06', 'ppl': '1.678', 'memory/max_active (GiB)': '77.19', 'memory/max_allocated (GiB)': '77.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '133.3', 'tokens/total': 1832108544, 'tokens/trainable': 676941568, 'epoch': '1.653'}
 55%|███████████████████████████████████████████████▉                                       | 965/1751 [16:09:24<13:10:50, 60.37s/it] 55%|███████████████████████████████████████████████▉                                       | 966/1751 [16:10:23<13:06:47, 60.14s/it]                                                                                                                                     {'loss': '0.5206', 'grad_norm': '0.1816', 'learning_rate': '9.133e-06', 'ppl': '1.683', 'memory/max_active (GiB)': '75.51', 'memory/max_allocated (GiB)': '75.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.07', 'tokens/total': 1833991936, 'tokens/trainable': 677648320, 'epoch': '1.655'}
 55%|███████████████████████████████████████████████▉                                       | 966/1751 [16:10:23<13:06:47, 60.14s/it] 55%|████████████████████████████████████████████████                                       | 967/1751 [16:11:24<13:06:08, 60.16s/it]                                                                                                                                     {'loss': '0.5115', 'grad_norm': '0.1758', 'learning_rate': '9.114e-06', 'ppl': '1.668', 'memory/max_active (GiB)': '76.1', 'memory/max_allocated (GiB)': '76.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.1', 'tokens/total': 1835887488, 'tokens/trainable': 678363648, 'epoch': '1.656'}
 55%|████████████████████████████████████████████████                                       | 967/1751 [16:11:24<13:06:08, 60.16s/it] 55%|████████████████████████████████████████████████                                       | 968/1751 [16:12:23<13:02:07, 59.93s/it]                                                                                                                                     {'loss': '0.5003', 'grad_norm': '0.1865', 'learning_rate': '9.095e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '75.43', 'memory/max_allocated (GiB)': '75.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.43', 'tokens/total': 1837732096, 'tokens/trainable': 679042752, 'epoch': '1.658'}
 55%|████████████████████████████████████████████████                                       | 968/1751 [16:12:23<13:02:07, 59.93s/it] 55%|████████████████████████████████████████████████▏                                      | 969/1751 [16:13:26<13:13:47, 60.90s/it]                                                                                                                                     {'loss': '0.5062', 'grad_norm': '0.1689', 'learning_rate': '9.076e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '76.33', 'memory/max_allocated (GiB)': '76.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '169.3', 'tokens/total': 1839757824, 'tokens/trainable': 679818496, 'epoch': '1.66'}
 55%|████████████████████████████████████████████████▏                                      | 969/1751 [16:13:26<13:13:47, 60.90s/it] 55%|████████████████████████████████████████████████▏                                      | 970/1751 [16:14:24<13:01:59, 60.08s/it]                                                                                                                                     {'loss': '0.5351', 'grad_norm': '0.1885', 'learning_rate': '9.057e-06', 'ppl': '1.708', 'memory/max_active (GiB)': '71.29', 'memory/max_allocated (GiB)': '71.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '103.5', 'tokens/total': 1841520512, 'tokens/trainable': 680495104, 'epoch': '1.661'}
 55%|████████████████████████████████████████████████▏                                      | 970/1751 [16:14:24<13:01:59, 60.08s/it] 55%|████████████████████████████████████████████████▏                                      | 971/1751 [16:15:23<12:53:30, 59.50s/it]                                                                                                                                     {'loss': '0.5534', 'grad_norm': '0.1875', 'learning_rate': '9.039e-06', 'ppl': '1.739', 'memory/max_active (GiB)': '70.91', 'memory/max_allocated (GiB)': '70.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.32', 'tokens/total': 1843329920, 'tokens/trainable': 681182080, 'epoch': '1.663'}
 55%|████████████████████████████████████████████████▏                                      | 971/1751 [16:15:23<12:53:30, 59.50s/it] 56%|████████████████████████████████████████████████▎                                      | 972/1751 [16:16:23<12:57:11, 59.86s/it]                                                                                                                                     {'loss': '0.519', 'grad_norm': '0.1816', 'learning_rate': '9.02e-06', 'ppl': '1.68', 'memory/max_active (GiB)': '71.49', 'memory/max_allocated (GiB)': '71.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '92.07', 'tokens/total': 1845259904, 'tokens/trainable': 681895168, 'epoch': '1.665'}
 56%|████████████████████████████████████████████████▎                                      | 972/1751 [16:16:23<12:57:11, 59.86s/it] 56%|████████████████████████████████████████████████▎                                      | 973/1751 [16:17:23<12:56:30, 59.89s/it]                                                                                                                                     {'loss': '0.5125', 'grad_norm': '0.1768', 'learning_rate': '9.001e-06', 'ppl': '1.67', 'memory/max_active (GiB)': '73.51', 'memory/max_allocated (GiB)': '73.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.68', 'tokens/total': 1847157760, 'tokens/trainable': 682629952, 'epoch': '1.667'}
 56%|████████████████████████████████████████████████▎                                      | 973/1751 [16:17:23<12:56:30, 59.89s/it] 56%|████████████████████████████████████████████████▍                                      | 974/1751 [16:18:23<12:54:23, 59.80s/it]                                                                                                                                     {'loss': '0.5399', 'grad_norm': '0.1895', 'learning_rate': '8.982e-06', 'ppl': '1.716', 'memory/max_active (GiB)': '70.33', 'memory/max_allocated (GiB)': '70.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '106.2', 'tokens/total': 1849012736, 'tokens/trainable': 683324224, 'epoch': '1.668'}
 56%|████████████████████████████████████████████████▍                                      | 974/1751 [16:18:23<12:54:23, 59.80s/it] 56%|████████████████████████████████████████████████▍                                      | 975/1751 [16:19:21<12:45:47, 59.21s/it]                                                                                                                                     {'loss': '0.5302', 'grad_norm': '0.1924', 'learning_rate': '8.963e-06', 'ppl': '1.699', 'memory/max_active (GiB)': '69.06', 'memory/max_allocated (GiB)': '69.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '155.4', 'tokens/total': 1850822272, 'tokens/trainable': 683966656, 'epoch': '1.67'}
 56%|████████████████████████████████████████████████▍                                      | 975/1751 [16:19:21<12:45:47, 59.21s/it] 56%|████████████████████████████████████████████████▍                                      | 976/1751 [16:20:19<12:40:59, 58.92s/it]                                                                                                                                     {'loss': '0.5633', 'grad_norm': '0.1982', 'learning_rate': '8.945e-06', 'ppl': '1.756', 'memory/max_active (GiB)': '73.24', 'memory/max_allocated (GiB)': '73.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.44', 'tokens/total': 1852649472, 'tokens/trainable': 684643200, 'epoch': '1.672'}
 56%|████████████████████████████████████████████████▍                                      | 976/1751 [16:20:19<12:40:59, 58.92s/it] 56%|████████████████████████████████████████████████▌                                      | 977/1751 [16:21:18<12:40:04, 58.92s/it]                                                                                                                                     {'loss': '0.528', 'grad_norm': '0.1787', 'learning_rate': '8.926e-06', 'ppl': '1.695', 'memory/max_active (GiB)': '77.13', 'memory/max_allocated (GiB)': '77.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '114.9', 'tokens/total': 1854513408, 'tokens/trainable': 685353920, 'epoch': '1.673'}
 56%|████████████████████████████████████████████████▌                                      | 977/1751 [16:21:18<12:40:04, 58.92s/it] 56%|████████████████████████████████████████████████▌                                      | 978/1751 [16:22:18<12:45:32, 59.42s/it]                                                                                                                                     {'loss': '0.5227', 'grad_norm': '0.1729', 'learning_rate': '8.907e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '71.73', 'memory/max_allocated (GiB)': '71.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '38.16', 'tokens/total': 1856414592, 'tokens/trainable': 686073600, 'epoch': '1.675'}
 56%|████████████████████████████████████████████████▌                                      | 978/1751 [16:22:18<12:45:32, 59.42s/it] 56%|████████████████████████████████████████████████▋                                      | 979/1751 [16:23:20<12:53:57, 60.15s/it]                                                                                                                                     {'loss': '0.5081', 'grad_norm': '0.1807', 'learning_rate': '8.888e-06', 'ppl': '1.662', 'memory/max_active (GiB)': '73.8', 'memory/max_allocated (GiB)': '73.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '49.72', 'tokens/total': 1858412416, 'tokens/trainable': 686797504, 'epoch': '1.677'}
 56%|████████████████████████████████████████████████▋                                      | 979/1751 [16:23:20<12:53:57, 60.15s/it] 56%|████████████████████████████████████████████████▋                                      | 980/1751 [16:24:18<12:44:07, 59.47s/it]                                                                                                                                     {'loss': '0.5221', 'grad_norm': '0.1787', 'learning_rate': '8.87e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '72.21', 'memory/max_allocated (GiB)': '72.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.73', 'tokens/total': 1860257536, 'tokens/trainable': 687499520, 'epoch': '1.679'}
 56%|████████████████████████████████████████████████▋                                      | 980/1751 [16:24:18<12:44:07, 59.47s/it] 56%|████████████████████████████████████████████████▋                                      | 981/1751 [16:25:20<12:52:36, 60.20s/it]                                                                                                                                     {'loss': '0.5076', 'grad_norm': '0.1768', 'learning_rate': '8.851e-06', 'ppl': '1.661', 'memory/max_active (GiB)': '69.49', 'memory/max_allocated (GiB)': '69.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '34.83', 'tokens/total': 1862230016, 'tokens/trainable': 688238784, 'epoch': '1.68'}
 56%|████████████████████████████████████████████████▋                                      | 981/1751 [16:25:20<12:52:36, 60.20s/it] 56%|████████████████████████████████████████████████▊                                      | 982/1751 [16:26:21<12:54:14, 60.41s/it]                                                                                                                                     {'loss': '0.5302', 'grad_norm': '0.1914', 'learning_rate': '8.832e-06', 'ppl': '1.699', 'memory/max_active (GiB)': '74.37', 'memory/max_allocated (GiB)': '74.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '162.7', 'tokens/total': 1864142720, 'tokens/trainable': 688943232, 'epoch': '1.682'}
 56%|████████████████████████████████████████████████▊                                      | 982/1751 [16:26:21<12:54:14, 60.41s/it] 56%|████████████████████████████████████████████████▊                                      | 983/1751 [16:27:23<12:59:14, 60.88s/it]                                                                                                                                     {'loss': '0.5231', 'grad_norm': '0.1768', 'learning_rate': '8.813e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '76.29', 'memory/max_allocated (GiB)': '76.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '28.84', 'tokens/total': 1866123264, 'tokens/trainable': 689674880, 'epoch': '1.684'}
 56%|████████████████████████████████████████████████▊                                      | 983/1751 [16:27:23<12:59:14, 60.88s/it] 56%|████████████████████████████████████████████████▉                                      | 984/1751 [16:28:24<12:57:50, 60.85s/it]                                                                                                                                     {'loss': '0.5153', 'grad_norm': '0.1895', 'learning_rate': '8.795e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '76.61', 'memory/max_allocated (GiB)': '76.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '93.84', 'tokens/total': 1868064512, 'tokens/trainable': 690408384, 'epoch': '1.685'}
 56%|████████████████████████████████████████████████▉                                      | 984/1751 [16:28:24<12:57:50, 60.85s/it] 56%|████████████████████████████████████████████████▉                                      | 985/1751 [16:29:26<13:03:26, 61.37s/it]                                                                                                                                     {'loss': '0.5159', 'grad_norm': '0.1777', 'learning_rate': '8.776e-06', 'ppl': '1.675', 'memory/max_active (GiB)': '76.25', 'memory/max_allocated (GiB)': '76.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104', 'tokens/total': 1870099200, 'tokens/trainable': 691144128, 'epoch': '1.687'}
 56%|████████████████████████████████████████████████▉                                      | 985/1751 [16:29:26<13:03:26, 61.37s/it] 56%|████████████████████████████████████████████████▉                                      | 986/1751 [16:30:25<12:50:41, 60.45s/it]                                                                                                                                     {'loss': '0.512', 'grad_norm': '0.1787', 'learning_rate': '8.757e-06', 'ppl': '1.669', 'memory/max_active (GiB)': '68.92', 'memory/max_allocated (GiB)': '68.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.9', 'tokens/total': 1871912192, 'tokens/trainable': 691824128, 'epoch': '1.689'}
 56%|████████████████████████████████████████████████▉                                      | 986/1751 [16:30:25<12:50:41, 60.45s/it] 56%|█████████████████████████████████████████████████                                      | 987/1751 [16:31:28<13:01:01, 61.34s/it]                                                                                                                                     {'loss': '0.4875', 'grad_norm': '0.165', 'learning_rate': '8.738e-06', 'ppl': '1.628', 'memory/max_active (GiB)': '77.28', 'memory/max_allocated (GiB)': '77.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '174.3', 'tokens/total': 1873936640, 'tokens/trainable': 692576960, 'epoch': '1.691'}
 56%|█████████████████████████████████████████████████                                      | 987/1751 [16:31:28<13:01:01, 61.34s/it] 56%|█████████████████████████████████████████████████                                      | 988/1751 [16:32:28<12:53:39, 60.84s/it]                                                                                                                                     {'loss': '0.5361', 'grad_norm': '0.1797', 'learning_rate': '8.72e-06', 'ppl': '1.709', 'memory/max_active (GiB)': '69.59', 'memory/max_allocated (GiB)': '69.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.87', 'tokens/total': 1875811456, 'tokens/trainable': 693297344, 'epoch': '1.692'}
 56%|█████████████████████████████████████████████████                                      | 988/1751 [16:32:28<12:53:39, 60.84s/it] 56%|█████████████████████████████████████████████████▏                                     | 989/1751 [16:33:28<12:52:12, 60.80s/it]                                                                                                                                     {'loss': '0.503', 'grad_norm': '0.1797', 'learning_rate': '8.701e-06', 'ppl': '1.654', 'memory/max_active (GiB)': '75.76', 'memory/max_allocated (GiB)': '75.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '146.6', 'tokens/total': 1877728768, 'tokens/trainable': 693991552, 'epoch': '1.694'}
 56%|█████████████████████████████████████████████████▏                                     | 989/1751 [16:33:28<12:52:12, 60.80s/it] 57%|█████████████████████████████████████████████████▏                                     | 990/1751 [16:34:28<12:48:01, 60.55s/it]                                                                                                                                     {'loss': '0.5163', 'grad_norm': '0.2012', 'learning_rate': '8.682e-06', 'ppl': '1.676', 'memory/max_active (GiB)': '73.19', 'memory/max_allocated (GiB)': '73.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '62.95', 'tokens/total': 1879577600, 'tokens/trainable': 694667136, 'epoch': '1.696'}
 57%|█████████████████████████████████████████████████▏                                     | 990/1751 [16:34:28<12:48:01, 60.55s/it] 57%|█████████████████████████████████████████████████▏                                     | 991/1751 [16:35:30<12:50:12, 60.81s/it]                                                                                                                                     {'loss': '0.5326', 'grad_norm': '0.1914', 'learning_rate': '8.664e-06', 'ppl': '1.703', 'memory/max_active (GiB)': '75.62', 'memory/max_allocated (GiB)': '75.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '57.08', 'tokens/total': 1881535488, 'tokens/trainable': 695391680, 'epoch': '1.697'}
 57%|█████████████████████████████████████████████████▏                                     | 991/1751 [16:35:30<12:50:12, 60.81s/it] 57%|█████████████████████████████████████████████████▎                                     | 992/1751 [16:36:30<12:47:41, 60.69s/it]                                                                                                                                     {'loss': '0.537', 'grad_norm': '0.1797', 'learning_rate': '8.645e-06', 'ppl': '1.711', 'memory/max_active (GiB)': '71.36', 'memory/max_allocated (GiB)': '71.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '33.56', 'tokens/total': 1883399040, 'tokens/trainable': 696098624, 'epoch': '1.699'}
 57%|█████████████████████████████████████████████████▎                                     | 992/1751 [16:36:30<12:47:41, 60.69s/it] 57%|█████████████████████████████████████████████████▎                                     | 993/1751 [16:37:27<12:32:25, 59.56s/it]                                                                                                                                     {'loss': '0.5609', 'grad_norm': '0.1973', 'learning_rate': '8.626e-06', 'ppl': '1.752', 'memory/max_active (GiB)': '70.44', 'memory/max_allocated (GiB)': '70.44', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '85.62', 'tokens/total': 1885170944, 'tokens/trainable': 696774272, 'epoch': '1.701'}
 57%|█████████████████████████████████████████████████▎                                     | 993/1751 [16:37:27<12:32:25, 59.56s/it] 57%|█████████████████████████████████████████████████▍                                     | 994/1751 [16:38:24<12:23:01, 58.89s/it]                                                                                                                                     {'loss': '0.5123', 'grad_norm': '0.1992', 'learning_rate': '8.607e-06', 'ppl': '1.669', 'memory/max_active (GiB)': '71.68', 'memory/max_allocated (GiB)': '71.68', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.62', 'tokens/total': 1886968320, 'tokens/trainable': 697426048, 'epoch': '1.703'}
 57%|█████████████████████████████████████████████████▍                                     | 994/1751 [16:38:24<12:23:01, 58.89s/it] 57%|█████████████████████████████████████████████████▍                                     | 995/1751 [16:39:24<12:23:53, 59.04s/it]                                                                                                                                     {'loss': '0.4994', 'grad_norm': '0.1875', 'learning_rate': '8.589e-06', 'ppl': '1.648', 'memory/max_active (GiB)': '73.77', 'memory/max_allocated (GiB)': '73.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '85.99', 'tokens/total': 1888859392, 'tokens/trainable': 698112384, 'epoch': '1.704'}
 57%|█████████████████████████████████████████████████▍                                     | 995/1751 [16:39:24<12:23:53, 59.04s/it] 57%|█████████████████████████████████████████████████▍                                     | 996/1751 [16:40:25<12:30:27, 59.64s/it]                                                                                                                                     {'loss': '0.5322', 'grad_norm': '0.1807', 'learning_rate': '8.57e-06', 'ppl': '1.703', 'memory/max_active (GiB)': '75.11', 'memory/max_allocated (GiB)': '75.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '56.95', 'tokens/total': 1890807680, 'tokens/trainable': 698811648, 'epoch': '1.706'}
 57%|█████████████████████████████████████████████████▍                                     | 996/1751 [16:40:25<12:30:27, 59.64s/it] 57%|█████████████████████████████████████████████████▌                                     | 997/1751 [16:41:24<12:27:10, 59.46s/it]                                                                                                                                     {'loss': '0.5116', 'grad_norm': '0.1846', 'learning_rate': '8.551e-06', 'ppl': '1.668', 'memory/max_active (GiB)': '76.01', 'memory/max_allocated (GiB)': '76.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.6', 'tokens/total': 1892646016, 'tokens/trainable': 699505280, 'epoch': '1.708'}
 57%|█████████████████████████████████████████████████▌                                     | 997/1751 [16:41:24<12:27:10, 59.46s/it] 57%|█████████████████████████████████████████████████▌                                     | 998/1751 [16:42:26<12:34:32, 60.12s/it]                                                                                                                                     {'loss': '0.5379', 'grad_norm': '0.1816', 'learning_rate': '8.533e-06', 'ppl': '1.712', 'memory/max_active (GiB)': '72.38', 'memory/max_allocated (GiB)': '72.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '97.61', 'tokens/total': 1894593920, 'tokens/trainable': 700230016, 'epoch': '1.709'}
 57%|█████████████████████████████████████████████████▌                                     | 998/1751 [16:42:26<12:34:32, 60.12s/it] 57%|█████████████████████████████████████████████████▋                                     | 999/1751 [16:43:25<12:29:22, 59.79s/it]                                                                                                                                     {'loss': '0.5356', 'grad_norm': '0.1934', 'learning_rate': '8.514e-06', 'ppl': '1.709', 'memory/max_active (GiB)': '75.21', 'memory/max_allocated (GiB)': '75.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '205.7', 'tokens/total': 1896443904, 'tokens/trainable': 700942208, 'epoch': '1.711'}
 57%|█████████████████████████████████████████████████▋                                     | 999/1751 [16:43:25<12:29:22, 59.79s/it] 57%|█████████████████████████████████████████████████                                     | 1000/1751 [16:44:23<12:22:58, 59.36s/it]                                                                                                                                     {'loss': '0.5508', 'grad_norm': '0.1846', 'learning_rate': '8.495e-06', 'ppl': '1.735', 'memory/max_active (GiB)': '72.26', 'memory/max_allocated (GiB)': '72.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '115.3', 'tokens/total': 1898266368, 'tokens/trainable': 701591424, 'epoch': '1.713'}
 57%|█████████████████████████████████████████████████                                     | 1000/1751 [16:44:23<12:22:58, 59.36s/it] 57%|█████████████████████████████████████████████████▏                                    | 1001/1751 [16:45:23<12:24:03, 59.52s/it]                                                                                                                                     {'loss': '0.5342', 'grad_norm': '0.1846', 'learning_rate': '8.477e-06', 'ppl': '1.706', 'memory/max_active (GiB)': '74.85', 'memory/max_allocated (GiB)': '74.85', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '144.3', 'tokens/total': 1900136064, 'tokens/trainable': 702289280, 'epoch': '1.715'}
 57%|█████████████████████████████████████████████████▏                                    | 1001/1751 [16:45:23<12:24:03, 59.52s/it] 57%|█████████████████████████████████████████████████▏                                    | 1002/1751 [16:46:26<12:37:51, 60.71s/it]                                                                                                                                     {'loss': '0.4925', 'grad_norm': '0.168', 'learning_rate': '8.458e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '77.45', 'memory/max_allocated (GiB)': '77.45', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '89.95', 'tokens/total': 1902167040, 'tokens/trainable': 703056960, 'epoch': '1.716'}
 57%|█████████████████████████████████████████████████▏                                    | 1002/1751 [16:46:26<12:37:51, 60.71s/it] 57%|█████████████████████████████████████████████████▎                                    | 1003/1751 [16:47:26<12:31:32, 60.28s/it]                                                                                                                                     {'loss': '0.524', 'grad_norm': '0.1797', 'learning_rate': '8.439e-06', 'ppl': '1.689', 'memory/max_active (GiB)': '73.19', 'memory/max_allocated (GiB)': '73.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.06', 'tokens/total': 1904029184, 'tokens/trainable': 703726144, 'epoch': '1.718'}
 57%|█████████████████████████████████████████████████▎                                    | 1003/1751 [16:47:26<12:31:32, 60.28s/it] 57%|█████████████████████████████████████████████████▎                                    | 1004/1751 [16:48:27<12:33:17, 60.51s/it]                                                                                                                                     {'loss': '0.4878', 'grad_norm': '0.1729', 'learning_rate': '8.421e-06', 'ppl': '1.629', 'memory/max_active (GiB)': '75.28', 'memory/max_allocated (GiB)': '75.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '51.52', 'tokens/total': 1905956096, 'tokens/trainable': 704452736, 'epoch': '1.72'}
 57%|█████████████████████████████████████████████████▎                                    | 1004/1751 [16:48:27<12:33:17, 60.51s/it] 57%|█████████████████████████████████████████████████▎                                    | 1005/1751 [16:49:25<12:25:57, 60.00s/it]                                                                                                                                     {'loss': '0.525', 'grad_norm': '0.1846', 'learning_rate': '8.402e-06', 'ppl': '1.69', 'memory/max_active (GiB)': '75.92', 'memory/max_allocated (GiB)': '75.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '101.5', 'tokens/total': 1907742336, 'tokens/trainable': 705111104, 'epoch': '1.721'}
 57%|█████████████████████████████████████████████████▎                                    | 1005/1751 [16:49:25<12:25:57, 60.00s/it] 57%|█████████████████████████████████████████████████▍                                    | 1006/1751 [16:50:24<12:19:11, 59.53s/it]                                                                                                                                     {'loss': '0.531', 'grad_norm': '0.1807', 'learning_rate': '8.383e-06', 'ppl': '1.701', 'memory/max_active (GiB)': '74.17', 'memory/max_allocated (GiB)': '74.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '119.4', 'tokens/total': 1909544832, 'tokens/trainable': 705786048, 'epoch': '1.723'}
 57%|█████████████████████████████████████████████████▍                                    | 1006/1751 [16:50:24<12:19:11, 59.53s/it] 58%|█████████████████████████████████████████████████▍                                    | 1007/1751 [16:51:25<12:23:55, 59.99s/it]                                                                                                                                     {'loss': '0.4946', 'grad_norm': '0.1709', 'learning_rate': '8.365e-06', 'ppl': '1.64', 'memory/max_active (GiB)': '70.77', 'memory/max_allocated (GiB)': '70.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '105', 'tokens/total': 1911506944, 'tokens/trainable': 706530240, 'epoch': '1.725'}
 58%|█████████████████████████████████████████████████▍                                    | 1007/1751 [16:51:25<12:23:55, 59.99s/it] 58%|█████████████████████████████████████████████████▌                                    | 1008/1751 [16:52:23<12:16:29, 59.47s/it]                                                                                                                                     {'loss': '0.5317', 'grad_norm': '0.1973', 'learning_rate': '8.346e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '71.23', 'memory/max_allocated (GiB)': '71.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '48.7', 'tokens/total': 1913314816, 'tokens/trainable': 707175040, 'epoch': '1.727'}
 58%|█████████████████████████████████████████████████▌                                    | 1008/1751 [16:52:23<12:16:29, 59.47s/it] 58%|█████████████████████████████████████████████████▌                                    | 1009/1751 [16:53:23<12:15:56, 59.51s/it]                                                                                                                                     {'loss': '0.5263', 'grad_norm': '0.1797', 'learning_rate': '8.328e-06', 'ppl': '1.693', 'memory/max_active (GiB)': '72.86', 'memory/max_allocated (GiB)': '72.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '155.4', 'tokens/total': 1915175680, 'tokens/trainable': 707886976, 'epoch': '1.728'}
 58%|█████████████████████████████████████████████████▌                                    | 1009/1751 [16:53:23<12:15:56, 59.51s/it] 58%|█████████████████████████████████████████████████▌                                    | 1010/1751 [16:54:24<12:19:40, 59.89s/it]                                                                                                                                     {'loss': '0.5303', 'grad_norm': '0.1943', 'learning_rate': '8.309e-06', 'ppl': '1.699', 'memory/max_active (GiB)': '74.12', 'memory/max_allocated (GiB)': '74.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '53.45', 'tokens/total': 1917061760, 'tokens/trainable': 708586496, 'epoch': '1.73'}
 58%|█████████████████████████████████████████████████▌                                    | 1010/1751 [16:54:24<12:19:40, 59.89s/it] 58%|█████████████████████████████████████████████████▋                                    | 1011/1751 [16:55:25<12:26:00, 60.49s/it]                                                                                                                                     {'loss': '0.4906', 'grad_norm': '0.1709', 'learning_rate': '8.29e-06', 'ppl': '1.633', 'memory/max_active (GiB)': '75.1', 'memory/max_allocated (GiB)': '75.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26.93', 'tokens/total': 1919013632, 'tokens/trainable': 709338432, 'epoch': '1.732'}
 58%|█████████████████████████████████████████████████▋                                    | 1011/1751 [16:55:25<12:26:00, 60.49s/it] 58%|█████████████████████████████████████████████████▋                                    | 1012/1751 [16:56:27<12:30:10, 60.91s/it]                                                                                                                                     {'loss': '0.4948', 'grad_norm': '0.1709', 'learning_rate': '8.272e-06', 'ppl': '1.64', 'memory/max_active (GiB)': '75.02', 'memory/max_allocated (GiB)': '75.02', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.4', 'tokens/total': 1920958080, 'tokens/trainable': 710039488, 'epoch': '1.733'}
 58%|█████████████████████████████████████████████████▋                                    | 1012/1751 [16:56:27<12:30:10, 60.91s/it] 58%|█████████████████████████████████████████████████▊                                    | 1013/1751 [16:57:27<12:23:57, 60.48s/it]                                                                                                                                     {'loss': '0.5302', 'grad_norm': '0.1943', 'learning_rate': '8.253e-06', 'ppl': '1.699', 'memory/max_active (GiB)': '74.69', 'memory/max_allocated (GiB)': '74.69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '133.9', 'tokens/total': 1922828160, 'tokens/trainable': 710735616, 'epoch': '1.735'}
 58%|█████████████████████████████████████████████████▊                                    | 1013/1751 [16:57:27<12:23:57, 60.48s/it] 58%|█████████████████████████████████████████████████▊                                    | 1014/1751 [16:58:25<12:13:50, 59.74s/it]                                                                                                                                     {'loss': '0.5226', 'grad_norm': '0.1885', 'learning_rate': '8.235e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '72.01', 'memory/max_allocated (GiB)': '72.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.98', 'tokens/total': 1924610048, 'tokens/trainable': 711369152, 'epoch': '1.737'}
 58%|█████████████████████████████████████████████████▊                                    | 1014/1751 [16:58:25<12:13:50, 59.74s/it] 58%|█████████████████████████████████████████████████▊                                    | 1015/1751 [16:59:25<12:12:55, 59.75s/it]                                                                                                                                     {'loss': '0.4864', 'grad_norm': '0.1758', 'learning_rate': '8.216e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '76.9', 'memory/max_allocated (GiB)': '76.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.39', 'tokens/total': 1926497152, 'tokens/trainable': 712058304, 'epoch': '1.739'}
 58%|█████████████████████████████████████████████████▊                                    | 1015/1751 [16:59:25<12:12:55, 59.75s/it] 58%|█████████████████████████████████████████████████▉                                    | 1016/1751 [17:00:23<12:05:13, 59.20s/it]                                                                                                                                     {'loss': '0.5372', 'grad_norm': '0.1934', 'learning_rate': '8.197e-06', 'ppl': '1.711', 'memory/max_active (GiB)': '74.44', 'memory/max_allocated (GiB)': '74.44', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '62.36', 'tokens/total': 1928309504, 'tokens/trainable': 712729408, 'epoch': '1.74'}
 58%|█████████████████████████████████████████████████▉                                    | 1016/1751 [17:00:23<12:05:13, 59.20s/it] 58%|█████████████████████████████████████████████████▉                                    | 1017/1751 [17:01:25<12:17:36, 60.30s/it]                                                                                                                                     {'loss': '0.5105', 'grad_norm': '0.1826', 'learning_rate': '8.179e-06', 'ppl': '1.666', 'memory/max_active (GiB)': '75.2', 'memory/max_allocated (GiB)': '75.2', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '156.3', 'tokens/total': 1930316672, 'tokens/trainable': 713464512, 'epoch': '1.742'}
 58%|█████████████████████████████████████████████████▉                                    | 1017/1751 [17:01:25<12:17:36, 60.30s/it] 58%|█████████████████████████████████████████████████▉                                    | 1018/1751 [17:02:26<12:17:24, 60.36s/it]                                                                                                                                     {'loss': '0.5433', 'grad_norm': '0.1943', 'learning_rate': '8.16e-06', 'ppl': '1.722', 'memory/max_active (GiB)': '75.54', 'memory/max_allocated (GiB)': '75.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '124.1', 'tokens/total': 1932251776, 'tokens/trainable': 714166016, 'epoch': '1.744'}
 58%|█████████████████████████████████████████████████▉                                    | 1018/1751 [17:02:26<12:17:24, 60.36s/it] 58%|██████████████████████████████████████████████████                                    | 1019/1751 [17:03:26<12:17:08, 60.42s/it]                                                                                                                                     {'loss': '0.5152', 'grad_norm': '0.1963', 'learning_rate': '8.142e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '74.76', 'memory/max_allocated (GiB)': '74.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '31.79', 'tokens/total': 1934132480, 'tokens/trainable': 714872640, 'epoch': '1.745'}
 58%|██████████████████████████████████████████████████                                    | 1019/1751 [17:03:26<12:17:08, 60.42s/it] 58%|██████████████████████████████████████████████████                                    | 1020/1751 [17:04:24<12:06:34, 59.64s/it]                                                                                                                                     {'loss': '0.5506', 'grad_norm': '0.2061', 'learning_rate': '8.123e-06', 'ppl': '1.734', 'memory/max_active (GiB)': '74.22', 'memory/max_allocated (GiB)': '74.22', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '114.6', 'tokens/total': 1935929216, 'tokens/trainable': 715530112, 'epoch': '1.747'}
 58%|██████████████████████████████████████████████████                                    | 1020/1751 [17:04:24<12:06:34, 59.64s/it] 58%|██████████████████████████████████████████████████▏                                   | 1021/1751 [17:05:23<12:03:35, 59.47s/it]                                                                                                                                     {'loss': '0.5425', 'grad_norm': '0.1934', 'learning_rate': '8.105e-06', 'ppl': '1.72', 'memory/max_active (GiB)': '73.57', 'memory/max_allocated (GiB)': '73.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.04', 'tokens/total': 1937806592, 'tokens/trainable': 716232704, 'epoch': '1.749'}
 58%|██████████████████████████████████████████████████▏                                   | 1021/1751 [17:05:23<12:03:35, 59.47s/it] 58%|██████████████████████████████████████████████████▏                                   | 1022/1751 [17:06:23<12:03:36, 59.56s/it]                                                                                                                                     {'loss': '0.4863', 'grad_norm': '0.1709', 'learning_rate': '8.086e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '69.54', 'memory/max_allocated (GiB)': '69.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.48', 'tokens/total': 1939682688, 'tokens/trainable': 716909568, 'epoch': '1.751'}
 58%|██████████████████████████████████████████████████▏                                   | 1022/1751 [17:06:23<12:03:36, 59.56s/it] 58%|██████████████████████████████████████████████████▏                                   | 1023/1751 [17:07:24<12:09:11, 60.10s/it]                                                                                                                                     {'loss': '0.5313', 'grad_norm': '0.1816', 'learning_rate': '8.068e-06', 'ppl': '1.701', 'memory/max_active (GiB)': '69.99', 'memory/max_allocated (GiB)': '69.99', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '116', 'tokens/total': 1941655552, 'tokens/trainable': 717627008, 'epoch': '1.752'}
 58%|██████████████████████████████████████████████████▏                                   | 1023/1751 [17:07:24<12:09:11, 60.10s/it] 58%|██████████████████████████████████████████████████▎                                   | 1024/1751 [17:08:26<12:14:56, 60.66s/it]                                                                                                                                     {'loss': '0.5135', 'grad_norm': '0.1816', 'learning_rate': '8.049e-06', 'ppl': '1.671', 'memory/max_active (GiB)': '72.89', 'memory/max_allocated (GiB)': '72.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '93.79', 'tokens/total': 1943619712, 'tokens/trainable': 718363520, 'epoch': '1.754'}
 58%|██████████████████████████████████████████████████▎                                   | 1024/1751 [17:08:26<12:14:56, 60.66s/it] 59%|██████████████████████████████████████████████████▎                                   | 1025/1751 [17:09:26<12:08:35, 60.21s/it]                                                                                                                                     {'loss': '0.4957', 'grad_norm': '0.1836', 'learning_rate': '8.031e-06', 'ppl': '1.642', 'memory/max_active (GiB)': '72.91', 'memory/max_allocated (GiB)': '72.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '145.6', 'tokens/total': 1945459584, 'tokens/trainable': 719040640, 'epoch': '1.756'}
 59%|██████████████████████████████████████████████████▎                                   | 1025/1751 [17:09:26<12:08:35, 60.21s/it] 59%|██████████████████████████████████████████████████▍                                   | 1026/1751 [17:10:24<12:00:10, 59.60s/it]                                                                                                                                     {'loss': '0.5162', 'grad_norm': '0.1934', 'learning_rate': '8.012e-06', 'ppl': '1.676', 'memory/max_active (GiB)': '74.75', 'memory/max_allocated (GiB)': '74.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.1', 'tokens/total': 1947308928, 'tokens/trainable': 719719680, 'epoch': '1.757'}
 59%|██████████████████████████████████████████████████▍                                   | 1026/1751 [17:10:24<12:00:10, 59.60s/it] 59%|██████████████████████████████████████████████████▍                                   | 1027/1751 [17:11:22<11:53:43, 59.15s/it]                                                                                                                                     {'loss': '0.4983', 'grad_norm': '0.1768', 'learning_rate': '7.994e-06', 'ppl': '1.646', 'memory/max_active (GiB)': '76.68', 'memory/max_allocated (GiB)': '76.68', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '78.86', 'tokens/total': 1949128320, 'tokens/trainable': 720400384, 'epoch': '1.759'}
 59%|██████████████████████████████████████████████████▍                                   | 1027/1751 [17:11:22<11:53:43, 59.15s/it] 59%|██████████████████████████████████████████████████▍                                   | 1028/1751 [17:12:22<11:58:04, 59.59s/it]                                                                                                                                     {'loss': '0.5133', 'grad_norm': '0.1865', 'learning_rate': '7.975e-06', 'ppl': '1.671', 'memory/max_active (GiB)': '71.88', 'memory/max_allocated (GiB)': '71.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '128.1', 'tokens/total': 1951027968, 'tokens/trainable': 721127040, 'epoch': '1.761'}
 59%|██████████████████████████████████████████████████▍                                   | 1028/1751 [17:12:22<11:58:04, 59.59s/it] 59%|██████████████████████████████████████████████████▌                                   | 1029/1751 [17:13:24<12:02:23, 60.03s/it]                                                                                                                                     {'loss': '0.521', 'grad_norm': '0.1777', 'learning_rate': '7.957e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '75.21', 'memory/max_allocated (GiB)': '75.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '37.92', 'tokens/total': 1952998400, 'tokens/trainable': 721846400, 'epoch': '1.763'}
 59%|██████████████████████████████████████████████████▌                                   | 1029/1751 [17:13:24<12:02:23, 60.03s/it] 59%|██████████████████████████████████████████████████▌                                   | 1030/1751 [17:14:27<12:12:59, 61.00s/it]                                                                                                                                     {'loss': '0.493', 'grad_norm': '0.1758', 'learning_rate': '7.938e-06', 'ppl': '1.637', 'memory/max_active (GiB)': '73.92', 'memory/max_allocated (GiB)': '73.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '114.1', 'tokens/total': 1955032832, 'tokens/trainable': 722584256, 'epoch': '1.764'}
 59%|██████████████████████████████████████████████████▌                                   | 1030/1751 [17:14:27<12:12:59, 61.00s/it] 59%|██████████████████████████████████████████████████▋                                   | 1031/1751 [17:15:25<12:00:16, 60.02s/it]                                                                                                                                     {'loss': '0.5634', 'grad_norm': '0.1904', 'learning_rate': '7.92e-06', 'ppl': '1.757', 'memory/max_active (GiB)': '71.87', 'memory/max_allocated (GiB)': '71.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '163.7', 'tokens/total': 1956821760, 'tokens/trainable': 723237824, 'epoch': '1.766'}
 59%|██████████████████████████████████████████████████▋                                   | 1031/1751 [17:15:25<12:00:16, 60.02s/it] 59%|██████████████████████████████████████████████████▋                                   | 1032/1751 [17:16:27<12:08:59, 60.83s/it]                                                                                                                                     {'loss': '0.5142', 'grad_norm': '0.1709', 'learning_rate': '7.901e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '73.5', 'memory/max_allocated (GiB)': '73.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.79', 'tokens/total': 1958819584, 'tokens/trainable': 724011200, 'epoch': '1.768'}
 59%|██████████████████████████████████████████████████▋                                   | 1032/1751 [17:16:27<12:08:59, 60.83s/it] 59%|██████████████████████████████████████████████████▋                                   | 1033/1751 [17:17:29<12:10:35, 61.05s/it]                                                                                                                                     {'loss': '0.4834', 'grad_norm': '0.165', 'learning_rate': '7.883e-06', 'ppl': '1.621', 'memory/max_active (GiB)': '72.7', 'memory/max_allocated (GiB)': '72.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '126.9', 'tokens/total': 1960778752, 'tokens/trainable': 724756224, 'epoch': '1.769'}
 59%|██████████████████████████████████████████████████▋                                   | 1033/1751 [17:17:29<12:10:35, 61.05s/it] 59%|██████████████████████████████████████████████████▊                                   | 1034/1751 [17:18:31<12:12:44, 61.32s/it]                                                                                                                                     {'loss': '0.4794', 'grad_norm': '0.1777', 'learning_rate': '7.864e-06', 'ppl': '1.615', 'memory/max_active (GiB)': '71.88', 'memory/max_allocated (GiB)': '71.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '21.81', 'tokens/total': 1962728064, 'tokens/trainable': 725461248, 'epoch': '1.771'}
 59%|██████████████████████████████████████████████████▊                                   | 1034/1751 [17:18:31<12:12:44, 61.32s/it] 59%|██████████████████████████████████████████████████▊                                   | 1035/1751 [17:19:33<12:13:55, 61.50s/it]                                                                                                                                     {'loss': '0.5374', 'grad_norm': '0.2012', 'learning_rate': '7.846e-06', 'ppl': '1.712', 'memory/max_active (GiB)': '74.51', 'memory/max_allocated (GiB)': '74.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '155.7', 'tokens/total': 1964665600, 'tokens/trainable': 726163008, 'epoch': '1.773'}
 59%|██████████████████████████████████████████████████▊                                   | 1035/1751 [17:19:33<12:13:55, 61.50s/it] 59%|██████████████████████████████████████████████████▉                                   | 1036/1751 [17:20:31<12:00:41, 60.48s/it]                                                                                                                                     {'loss': '0.5429', 'grad_norm': '0.1924', 'learning_rate': '7.827e-06', 'ppl': '1.721', 'memory/max_active (GiB)': '74.77', 'memory/max_allocated (GiB)': '74.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.06', 'tokens/total': 1966447616, 'tokens/trainable': 726797824, 'epoch': '1.775'}
 59%|██████████████████████████████████████████████████▉                                   | 1036/1751 [17:20:31<12:00:41, 60.48s/it] 59%|██████████████████████████████████████████████████▉                                   | 1037/1751 [17:21:32<12:01:40, 60.64s/it]                                                                                                                                     {'loss': '0.5352', 'grad_norm': '0.1797', 'learning_rate': '7.809e-06', 'ppl': '1.708', 'memory/max_active (GiB)': '76.35', 'memory/max_allocated (GiB)': '76.35', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '160.2', 'tokens/total': 1968334848, 'tokens/trainable': 727538176, 'epoch': '1.776'}
 59%|██████████████████████████████████████████████████▉                                   | 1037/1751 [17:21:32<12:01:40, 60.64s/it] 59%|██████████████████████████████████████████████████▉                                   | 1038/1751 [17:22:30<11:52:22, 59.95s/it]                                                                                                                                     {'loss': '0.5697', 'grad_norm': '0.1973', 'learning_rate': '7.791e-06', 'ppl': '1.768', 'memory/max_active (GiB)': '72.27', 'memory/max_allocated (GiB)': '72.27', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '72.52', 'tokens/total': 1970159232, 'tokens/trainable': 728194688, 'epoch': '1.778'}
 59%|██████████████████████████████████████████████████▉                                   | 1038/1751 [17:22:30<11:52:22, 59.95s/it] 59%|███████████████████████████████████████████████████                                   | 1039/1751 [17:23:30<11:51:59, 60.00s/it]                                                                                                                                     {'loss': '0.5379', 'grad_norm': '0.1797', 'learning_rate': '7.772e-06', 'ppl': '1.712', 'memory/max_active (GiB)': '77.23', 'memory/max_allocated (GiB)': '77.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '91.34', 'tokens/total': 1972071552, 'tokens/trainable': 728891712, 'epoch': '1.78'}
 59%|███████████████████████████████████████████████████                                   | 1039/1751 [17:23:30<11:51:59, 60.00s/it] 59%|███████████████████████████████████████████████████                                   | 1040/1751 [17:24:30<11:48:37, 59.80s/it]                                                                                                                                     {'loss': '0.5432', 'grad_norm': '0.1875', 'learning_rate': '7.754e-06', 'ppl': '1.722', 'memory/max_active (GiB)': '70.56', 'memory/max_allocated (GiB)': '70.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.7', 'tokens/total': 1973930368, 'tokens/trainable': 729576448, 'epoch': '1.781'}
 59%|███████████████████████████████████████████████████                                   | 1040/1751 [17:24:30<11:48:37, 59.80s/it] 59%|███████████████████████████████████████████████████▏                                  | 1041/1751 [17:25:28<11:44:10, 59.51s/it]                                                                                                                                     {'loss': '0.558', 'grad_norm': '0.1777', 'learning_rate': '7.735e-06', 'ppl': '1.747', 'memory/max_active (GiB)': '74.52', 'memory/max_allocated (GiB)': '74.52', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '49.13', 'tokens/total': 1975791744, 'tokens/trainable': 730250560, 'epoch': '1.783'}
 59%|███████████████████████████████████████████████████▏                                  | 1041/1751 [17:25:28<11:44:10, 59.51s/it] 60%|███████████████████████████████████████████████████▏                                  | 1042/1751 [17:26:25<11:33:17, 58.67s/it]                                                                                                                                     {'loss': '0.5523', 'grad_norm': '0.2021', 'learning_rate': '7.717e-06', 'ppl': '1.737', 'memory/max_active (GiB)': '68.41', 'memory/max_allocated (GiB)': '68.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.16', 'tokens/total': 1977546368, 'tokens/trainable': 730894592, 'epoch': '1.785'}
 60%|███████████████████████████████████████████████████▏                                  | 1042/1751 [17:26:25<11:33:17, 58.67s/it] 60%|███████████████████████████████████████████████████▏                                  | 1043/1751 [17:27:27<11:43:18, 59.60s/it]                                                                                                                                     {'loss': '0.4793', 'grad_norm': '0.1885', 'learning_rate': '7.699e-06', 'ppl': '1.615', 'memory/max_active (GiB)': '71.35', 'memory/max_allocated (GiB)': '71.35', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '105.5', 'tokens/total': 1979496320, 'tokens/trainable': 731631872, 'epoch': '1.787'}
 60%|███████████████████████████████████████████████████▏                                  | 1043/1751 [17:27:27<11:43:18, 59.60s/it] 60%|███████████████████████████████████████████████████▎                                  | 1044/1751 [17:28:28<11:49:04, 60.18s/it]                                                                                                                                     {'loss': '0.5174', 'grad_norm': '0.1758', 'learning_rate': '7.68e-06', 'ppl': '1.678', 'memory/max_active (GiB)': '76.87', 'memory/max_allocated (GiB)': '76.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.53', 'tokens/total': 1981440896, 'tokens/trainable': 732345856, 'epoch': '1.788'}
 60%|███████████████████████████████████████████████████▎                                  | 1044/1751 [17:28:28<11:49:04, 60.18s/it] 60%|███████████████████████████████████████████████████▎                                  | 1045/1751 [17:29:29<11:47:50, 60.16s/it]                                                                                                                                     {'loss': '0.5633', 'grad_norm': '0.1875', 'learning_rate': '7.662e-06', 'ppl': '1.757', 'memory/max_active (GiB)': '76.65', 'memory/max_allocated (GiB)': '76.65', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '118.3', 'tokens/total': 1983316224, 'tokens/trainable': 733061568, 'epoch': '1.79'}
 60%|███████████████████████████████████████████████████▎                                  | 1045/1751 [17:29:29<11:47:50, 60.16s/it] 60%|███████████████████████████████████████████████████▎                                  | 1046/1751 [17:30:32<11:57:22, 61.05s/it]                                                                                                                                     {'loss': '0.5215', 'grad_norm': '0.1797', 'learning_rate': '7.644e-06', 'ppl': '1.685', 'memory/max_active (GiB)': '76.92', 'memory/max_allocated (GiB)': '76.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '164', 'tokens/total': 1985305088, 'tokens/trainable': 733786304, 'epoch': '1.792'}
 60%|███████████████████████████████████████████████████▎                                  | 1046/1751 [17:30:32<11:57:22, 61.05s/it] 60%|███████████████████████████████████████████████████▍                                  | 1047/1751 [17:31:32<11:52:18, 60.71s/it]                                                                                                                                     {'loss': '0.5187', 'grad_norm': '0.1777', 'learning_rate': '7.625e-06', 'ppl': '1.68', 'memory/max_active (GiB)': '73.88', 'memory/max_allocated (GiB)': '73.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '51.15', 'tokens/total': 1987179392, 'tokens/trainable': 734494080, 'epoch': '1.793'}
 60%|███████████████████████████████████████████████████▍                                  | 1047/1751 [17:31:32<11:52:18, 60.71s/it] 60%|███████████████████████████████████████████████████▍                                  | 1048/1751 [17:32:30<11:44:42, 60.15s/it]                                                                                                                                     {'loss': '0.5446', 'grad_norm': '0.1836', 'learning_rate': '7.607e-06', 'ppl': '1.724', 'memory/max_active (GiB)': '77.08', 'memory/max_allocated (GiB)': '77.08', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '57.86', 'tokens/total': 1989027584, 'tokens/trainable': 735197312, 'epoch': '1.795'}
 60%|███████████████████████████████████████████████████▍                                  | 1048/1751 [17:32:30<11:44:42, 60.15s/it] 60%|███████████████████████████████████████████████████▌                                  | 1049/1751 [17:33:28<11:35:51, 59.48s/it]                                                                                                                                     {'loss': '0.5837', 'grad_norm': '0.1943', 'learning_rate': '7.589e-06', 'ppl': '1.793', 'memory/max_active (GiB)': '72.78', 'memory/max_allocated (GiB)': '72.78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '153.2', 'tokens/total': 1990846848, 'tokens/trainable': 735863168, 'epoch': '1.797'}
 60%|███████████████████████████████████████████████████▌                                  | 1049/1751 [17:33:28<11:35:51, 59.48s/it] 60%|███████████████████████████████████████████████████▌                                  | 1050/1751 [17:34:29<11:38:16, 59.77s/it]                                                                                                                                     {'loss': '0.5537', 'grad_norm': '0.1934', 'learning_rate': '7.57e-06', 'ppl': '1.74', 'memory/max_active (GiB)': '74.61', 'memory/max_allocated (GiB)': '74.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '92.75', 'tokens/total': 1992753920, 'tokens/trainable': 736558592, 'epoch': '1.799'}
 60%|███████████████████████████████████████████████████▌                                  | 1050/1751 [17:34:29<11:38:16, 59.77s/it] 60%|███████████████████████████████████████████████████▌                                  | 1051/1751 [17:35:33<11:52:51, 61.10s/it]                                                                                                                                     {'loss': '0.5124', 'grad_norm': '0.1689', 'learning_rate': '7.552e-06', 'ppl': '1.669', 'memory/max_active (GiB)': '76.15', 'memory/max_allocated (GiB)': '76.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '51.19', 'tokens/total': 1994809088, 'tokens/trainable': 737309888, 'epoch': '1.8'}
 60%|███████████████████████████████████████████████████▌                                  | 1051/1751 [17:35:33<11:52:51, 61.10s/it] 60%|███████████████████████████████████████████████████▋                                  | 1052/1751 [17:36:33<11:46:39, 60.66s/it]                                                                                                                                     {'loss': '0.5247', 'grad_norm': '0.1846', 'learning_rate': '7.534e-06', 'ppl': '1.69', 'memory/max_active (GiB)': '76.29', 'memory/max_allocated (GiB)': '76.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '54.71', 'tokens/total': 1996671360, 'tokens/trainable': 738018816, 'epoch': '1.802'}
 60%|███████████████████████████████████████████████████▋                                  | 1052/1751 [17:36:33<11:46:39, 60.66s/it] 60%|███████████████████████████████████████████████████▋                                  | 1053/1751 [17:37:32<11:39:52, 60.16s/it]                                                                                                                                     {'loss': '0.5365', 'grad_norm': '0.1914', 'learning_rate': '7.515e-06', 'ppl': '1.71', 'memory/max_active (GiB)': '76.46', 'memory/max_allocated (GiB)': '76.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '123.9', 'tokens/total': 1998534272, 'tokens/trainable': 738706304, 'epoch': '1.804'}
 60%|███████████████████████████████████████████████████▋                                  | 1053/1751 [17:37:32<11:39:52, 60.16s/it] 60%|███████████████████████████████████████████████████▊                                  | 1054/1751 [17:38:33<11:42:48, 60.50s/it]                                                                                                                                     {'loss': '0.5055', 'grad_norm': '0.1797', 'learning_rate': '7.497e-06', 'ppl': '1.658', 'memory/max_active (GiB)': '76.25', 'memory/max_allocated (GiB)': '76.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '179.8', 'tokens/total': 2000497664, 'tokens/trainable': 739449536, 'epoch': '1.805'}
 60%|███████████████████████████████████████████████████▊                                  | 1054/1751 [17:38:33<11:42:48, 60.50s/it] 60%|███████████████████████████████████████████████████▊                                  | 1055/1751 [17:39:33<11:38:38, 60.23s/it]                                                                                                                                     {'loss': '0.5682', 'grad_norm': '0.2061', 'learning_rate': '7.479e-06', 'ppl': '1.765', 'memory/max_active (GiB)': '75.61', 'memory/max_allocated (GiB)': '75.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '53.98', 'tokens/total': 2002337536, 'tokens/trainable': 740182784, 'epoch': '1.807'}
 60%|███████████████████████████████████████████████████▊                                  | 1055/1751 [17:39:33<11:38:38, 60.23s/it] 60%|███████████████████████████████████████████████████▊                                  | 1056/1751 [17:40:33<11:38:31, 60.30s/it]                                                                                                                                     {'loss': '0.5261', 'grad_norm': '0.1729', 'learning_rate': '7.46e-06', 'ppl': '1.692', 'memory/max_active (GiB)': '70.62', 'memory/max_allocated (GiB)': '70.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '132.2', 'tokens/total': 2004213376, 'tokens/trainable': 740899776, 'epoch': '1.809'}
 60%|███████████████████████████████████████████████████▊                                  | 1056/1751 [17:40:33<11:38:31, 60.30s/it] 60%|███████████████████████████████████████████████████▉                                  | 1057/1751 [17:41:34<11:38:53, 60.42s/it]                                                                                                                                     {'loss': '0.5109', 'grad_norm': '0.1797', 'learning_rate': '7.442e-06', 'ppl': '1.667', 'memory/max_active (GiB)': '74.6', 'memory/max_allocated (GiB)': '74.6', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.61', 'tokens/total': 2006148992, 'tokens/trainable': 741641792, 'epoch': '1.811'}
 60%|███████████████████████████████████████████████████▉                                  | 1057/1751 [17:41:34<11:38:53, 60.42s/it] 60%|███████████████████████████████████████████████████▉                                  | 1058/1751 [17:42:34<11:36:14, 60.28s/it]                                                                                                                                     {'loss': '0.5011', 'grad_norm': '0.1826', 'learning_rate': '7.424e-06', 'ppl': '1.65', 'memory/max_active (GiB)': '70.24', 'memory/max_allocated (GiB)': '70.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.81', 'tokens/total': 2008030976, 'tokens/trainable': 742352512, 'epoch': '1.812'}
 60%|███████████████████████████████████████████████████▉                                  | 1058/1751 [17:42:34<11:36:14, 60.28s/it] 60%|████████████████████████████████████████████████████                                  | 1059/1751 [17:43:34<11:34:13, 60.19s/it]                                                                                                                                     {'loss': '0.5536', 'grad_norm': '0.1943', 'learning_rate': '7.406e-06', 'ppl': '1.739', 'memory/max_active (GiB)': '73.12', 'memory/max_allocated (GiB)': '73.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.79', 'tokens/total': 2009922176, 'tokens/trainable': 743028480, 'epoch': '1.814'}
 60%|████████████████████████████████████████████████████                                  | 1059/1751 [17:43:34<11:34:13, 60.19s/it] 61%|████████████████████████████████████████████████████                                  | 1060/1751 [17:44:35<11:38:19, 60.64s/it]                                                                                                                                     {'loss': '0.4745', 'grad_norm': '0.1699', 'learning_rate': '7.388e-06', 'ppl': '1.607', 'memory/max_active (GiB)': '68.34', 'memory/max_allocated (GiB)': '68.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.1', 'tokens/total': 2011888640, 'tokens/trainable': 743728960, 'epoch': '1.816'}
 61%|████████████████████████████████████████████████████                                  | 1060/1751 [17:44:35<11:38:19, 60.64s/it] 61%|████████████████████████████████████████████████████                                  | 1061/1751 [17:45:37<11:40:50, 60.94s/it]                                                                                                                                     {'loss': '0.4693', 'grad_norm': '0.1807', 'learning_rate': '7.369e-06', 'ppl': '1.599', 'memory/max_active (GiB)': '72.91', 'memory/max_allocated (GiB)': '72.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.94', 'tokens/total': 2013878272, 'tokens/trainable': 744434624, 'epoch': '1.817'}
 61%|████████████████████████████████████████████████████                                  | 1061/1751 [17:45:37<11:40:50, 60.94s/it] 61%|████████████████████████████████████████████████████▏                                 | 1062/1751 [17:46:38<11:41:09, 61.06s/it]                                                                                                                                     {'loss': '0.4981', 'grad_norm': '0.1836', 'learning_rate': '7.351e-06', 'ppl': '1.646', 'memory/max_active (GiB)': '73.86', 'memory/max_allocated (GiB)': '73.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108', 'tokens/total': 2015760384, 'tokens/trainable': 745153472, 'epoch': '1.819'}
 61%|████████████████████████████████████████████████████▏                                 | 1062/1751 [17:46:38<11:41:09, 61.06s/it] 61%|████████████████████████████████████████████████████▏                                 | 1063/1751 [17:47:39<11:39:14, 60.98s/it]                                                                                                                                     {'loss': '0.499', 'grad_norm': '0.1885', 'learning_rate': '7.333e-06', 'ppl': '1.647', 'memory/max_active (GiB)': '73.34', 'memory/max_allocated (GiB)': '73.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '30.79', 'tokens/total': 2017621760, 'tokens/trainable': 745860864, 'epoch': '1.821'}
 61%|████████████████████████████████████████████████████▏                                 | 1063/1751 [17:47:39<11:39:14, 60.98s/it] 61%|████████████████████████████████████████████████████▎                                 | 1064/1751 [17:48:39<11:33:37, 60.58s/it]                                                                                                                                     {'loss': '0.5349', 'grad_norm': '0.1846', 'learning_rate': '7.315e-06', 'ppl': '1.707', 'memory/max_active (GiB)': '74.02', 'memory/max_allocated (GiB)': '74.02', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '134.6', 'tokens/total': 2019509120, 'tokens/trainable': 746547456, 'epoch': '1.823'}
 61%|████████████████████████████████████████████████████▎                                 | 1064/1751 [17:48:39<11:33:37, 60.58s/it] 61%|████████████████████████████████████████████████████▎                                 | 1065/1751 [17:49:36<11:21:38, 59.62s/it]                                                                                                                                     {'loss': '0.5786', 'grad_norm': '0.2012', 'learning_rate': '7.296e-06', 'ppl': '1.784', 'memory/max_active (GiB)': '70.99', 'memory/max_allocated (GiB)': '70.99', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.44', 'tokens/total': 2021325952, 'tokens/trainable': 747212416, 'epoch': '1.824'}
 61%|████████████████████████████████████████████████████▎                                 | 1065/1751 [17:49:36<11:21:38, 59.62s/it] 61%|████████████████████████████████████████████████████▎                                 | 1066/1751 [17:50:36<11:21:45, 59.72s/it]                                                                                                                                     {'loss': '0.5004', 'grad_norm': '0.1895', 'learning_rate': '7.278e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '76.66', 'memory/max_allocated (GiB)': '76.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '137.2', 'tokens/total': 2023199744, 'tokens/trainable': 747873408, 'epoch': '1.826'}
 61%|████████████████████████████████████████████████████▎                                 | 1066/1751 [17:50:36<11:21:45, 59.72s/it] 61%|████████████████████████████████████████████████████▍                                 | 1067/1751 [17:51:36<11:20:48, 59.72s/it]                                                                                                                                     {'loss': '0.5146', 'grad_norm': '0.1865', 'learning_rate': '7.26e-06', 'ppl': '1.673', 'memory/max_active (GiB)': '70.25', 'memory/max_allocated (GiB)': '70.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '34.2', 'tokens/total': 2025128960, 'tokens/trainable': 748582592, 'epoch': '1.828'}
 61%|████████████████████████████████████████████████████▍                                 | 1067/1751 [17:51:36<11:20:48, 59.72s/it] 61%|████████████████████████████████████████████████████▍                                 | 1068/1751 [17:52:38<11:27:46, 60.42s/it]                                                                                                                                     {'loss': '0.5271', 'grad_norm': '0.1729', 'learning_rate': '7.242e-06', 'ppl': '1.694', 'memory/max_active (GiB)': '71.88', 'memory/max_allocated (GiB)': '71.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '135.4', 'tokens/total': 2027080576, 'tokens/trainable': 749301376, 'epoch': '1.829'}
 61%|████████████████████████████████████████████████████▍                                 | 1068/1751 [17:52:38<11:27:46, 60.42s/it] 61%|████████████████████████████████████████████████████▌                                 | 1069/1751 [17:53:38<11:25:12, 60.28s/it]                                                                                                                                     {'loss': '0.5228', 'grad_norm': '0.1768', 'learning_rate': '7.224e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '75.46', 'memory/max_allocated (GiB)': '75.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.4', 'tokens/total': 2028936576, 'tokens/trainable': 749974464, 'epoch': '1.831'}
 61%|████████████████████████████████████████████████████▌                                 | 1069/1751 [17:53:38<11:25:12, 60.28s/it] 61%|████████████████████████████████████████████████████▌                                 | 1070/1751 [17:54:36<11:15:38, 59.53s/it]                                                                                                                                     {'loss': '0.5585', 'grad_norm': '0.1934', 'learning_rate': '7.206e-06', 'ppl': '1.748', 'memory/max_active (GiB)': '72.91', 'memory/max_allocated (GiB)': '72.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '23.03', 'tokens/total': 2030721152, 'tokens/trainable': 750628096, 'epoch': '1.833'}
 61%|████████████████████████████████████████████████████▌                                 | 1070/1751 [17:54:36<11:15:38, 59.53s/it] 61%|████████████████████████████████████████████████████▌                                 | 1071/1751 [17:55:35<11:14:15, 59.49s/it]                                                                                                                                     {'loss': '0.5047', 'grad_norm': '0.1787', 'learning_rate': '7.188e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '70.43', 'memory/max_allocated (GiB)': '70.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26.18', 'tokens/total': 2032614912, 'tokens/trainable': 751313536, 'epoch': '1.835'}
 61%|████████████████████████████████████████████████████▌                                 | 1071/1751 [17:55:35<11:14:15, 59.49s/it] 61%|████████████████████████████████████████████████████▋                                 | 1072/1751 [17:56:32<11:04:38, 58.73s/it]                                                                                                                                     {'loss': '0.5514', 'grad_norm': '0.1914', 'learning_rate': '7.17e-06', 'ppl': '1.736', 'memory/max_active (GiB)': '71.55', 'memory/max_allocated (GiB)': '71.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '124.2', 'tokens/total': 2034379136, 'tokens/trainable': 751948480, 'epoch': '1.836'}
 61%|████████████████████████████████████████████████████▋                                 | 1072/1751 [17:56:32<11:04:38, 58.73s/it] 61%|████████████████████████████████████████████████████▋                                 | 1073/1751 [17:57:34<11:13:58, 59.64s/it]                                                                                                                                     {'loss': '0.4944', 'grad_norm': '0.1709', 'learning_rate': '7.151e-06', 'ppl': '1.639', 'memory/max_active (GiB)': '76.76', 'memory/max_allocated (GiB)': '76.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '79.9', 'tokens/total': 2036370816, 'tokens/trainable': 752702656, 'epoch': '1.838'}
 61%|████████████████████████████████████████████████████▋                                 | 1073/1751 [17:57:34<11:13:58, 59.64s/it] 61%|████████████████████████████████████████████████████▋                                 | 1074/1751 [17:58:34<11:13:33, 59.70s/it]                                                                                                                                     {'loss': '0.4967', 'grad_norm': '0.1875', 'learning_rate': '7.133e-06', 'ppl': '1.643', 'memory/max_active (GiB)': '75.41', 'memory/max_allocated (GiB)': '75.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107.9', 'tokens/total': 2038281600, 'tokens/trainable': 753379136, 'epoch': '1.84'}
 61%|████████████████████████████████████████████████████▋                                 | 1074/1751 [17:58:34<11:13:33, 59.70s/it] 61%|████████████████████████████████████████████████████▊                                 | 1075/1751 [17:59:32<11:09:42, 59.44s/it]                                                                                                                                     {'loss': '0.5361', 'grad_norm': '0.1865', 'learning_rate': '7.115e-06', 'ppl': '1.709', 'memory/max_active (GiB)': '75.65', 'memory/max_allocated (GiB)': '75.65', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.37', 'tokens/total': 2040129664, 'tokens/trainable': 754071808, 'epoch': '1.841'}
 61%|████████████████████████████████████████████████████▊                                 | 1075/1751 [17:59:32<11:09:42, 59.44s/it] 61%|████████████████████████████████████████████████████▊                                 | 1076/1751 [18:00:33<11:13:49, 59.90s/it]                                                                                                                                     {'loss': '0.5252', 'grad_norm': '0.1768', 'learning_rate': '7.097e-06', 'ppl': '1.691', 'memory/max_active (GiB)': '74.41', 'memory/max_allocated (GiB)': '74.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '33.83', 'tokens/total': 2042067200, 'tokens/trainable': 754792512, 'epoch': '1.843'}
 61%|████████████████████████████████████████████████████▊                                 | 1076/1751 [18:00:33<11:13:49, 59.90s/it] 62%|████████████████████████████████████████████████████▉                                 | 1077/1751 [18:01:33<11:12:58, 59.91s/it]                                                                                                                                     {'loss': '0.5192', 'grad_norm': '0.1934', 'learning_rate': '7.079e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '71.33', 'memory/max_allocated (GiB)': '71.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.82', 'tokens/total': 2044004992, 'tokens/trainable': 755491328, 'epoch': '1.845'}
 62%|████████████████████████████████████████████████████▉                                 | 1077/1751 [18:01:33<11:12:58, 59.91s/it] 62%|████████████████████████████████████████████████████▉                                 | 1078/1751 [18:02:32<11:07:03, 59.47s/it]                                                                                                                                     {'loss': '0.4971', 'grad_norm': '0.1768', 'learning_rate': '7.061e-06', 'ppl': '1.644', 'memory/max_active (GiB)': '75.74', 'memory/max_allocated (GiB)': '75.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '100.1', 'tokens/total': 2045850880, 'tokens/trainable': 756194496, 'epoch': '1.847'}
 62%|████████████████████████████████████████████████████▉                                 | 1078/1751 [18:02:32<11:07:03, 59.47s/it] 62%|████████████████████████████████████████████████████▉                                 | 1079/1751 [18:03:32<11:08:13, 59.66s/it]                                                                                                                                     {'loss': '0.5047', 'grad_norm': '0.1738', 'learning_rate': '7.043e-06', 'ppl': '1.656', 'memory/max_active (GiB)': '75.98', 'memory/max_allocated (GiB)': '75.98', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '66.41', 'tokens/total': 2047799936, 'tokens/trainable': 756902720, 'epoch': '1.848'}
 62%|████████████████████████████████████████████████████▉                                 | 1079/1751 [18:03:32<11:08:13, 59.66s/it] 62%|█████████████████████████████████████████████████████                                 | 1080/1751 [18:04:27<10:52:49, 58.37s/it]                                                                                                                                     {'loss': '0.5356', 'grad_norm': '0.1924', 'learning_rate': '7.025e-06', 'ppl': '1.709', 'memory/max_active (GiB)': '67.5', 'memory/max_allocated (GiB)': '67.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.69', 'tokens/total': 2049487232, 'tokens/trainable': 757545344, 'epoch': '1.85'}
 62%|█████████████████████████████████████████████████████                                 | 1080/1751 [18:04:27<10:52:49, 58.37s/it] 62%|█████████████████████████████████████████████████████                                 | 1081/1751 [18:05:26<10:52:58, 58.48s/it]                                                                                                                                     {'loss': '0.5327', 'grad_norm': '0.1934', 'learning_rate': '7.007e-06', 'ppl': '1.704', 'memory/max_active (GiB)': '75.84', 'memory/max_allocated (GiB)': '75.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '35.88', 'tokens/total': 2051338880, 'tokens/trainable': 758231680, 'epoch': '1.852'}
 62%|█████████████████████████████████████████████████████                                 | 1081/1751 [18:05:26<10:52:58, 58.48s/it] 62%|█████████████████████████████████████████████████████▏                                | 1082/1751 [18:06:28<11:03:32, 59.51s/it]                                                                                                                                     {'loss': '0.5205', 'grad_norm': '0.1709', 'learning_rate': '6.989e-06', 'ppl': '1.683', 'memory/max_active (GiB)': '72.85', 'memory/max_allocated (GiB)': '72.85', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.49', 'tokens/total': 2053248384, 'tokens/trainable': 758959808, 'epoch': '1.853'}
 62%|█████████████████████████████████████████████████████▏                                | 1082/1751 [18:06:28<11:03:32, 59.51s/it] 62%|█████████████████████████████████████████████████████▏                                | 1083/1751 [18:07:27<11:00:53, 59.36s/it]                                                                                                                                     {'loss': '0.5378', 'grad_norm': '0.1855', 'learning_rate': '6.971e-06', 'ppl': '1.712', 'memory/max_active (GiB)': '76.42', 'memory/max_allocated (GiB)': '76.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '131.4', 'tokens/total': 2055081216, 'tokens/trainable': 759642496, 'epoch': '1.855'}
 62%|█████████████████████████████████████████████████████▏                                | 1083/1751 [18:07:27<11:00:53, 59.36s/it] 62%|█████████████████████████████████████████████████████▏                                | 1084/1751 [18:08:26<11:00:31, 59.42s/it]                                                                                                                                     {'loss': '0.4914', 'grad_norm': '0.1758', 'learning_rate': '6.953e-06', 'ppl': '1.635', 'memory/max_active (GiB)': '76.03', 'memory/max_allocated (GiB)': '76.03', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '139.4', 'tokens/total': 2056931584, 'tokens/trainable': 760322624, 'epoch': '1.857'}
 62%|█████████████████████████████████████████████████████▏                                | 1084/1751 [18:08:26<11:00:31, 59.42s/it] 62%|█████████████████████████████████████████████████████▎                                | 1085/1751 [18:09:25<10:55:21, 59.04s/it]                                                                                                                                     {'loss': '0.541', 'grad_norm': '0.1924', 'learning_rate': '6.935e-06', 'ppl': '1.718', 'memory/max_active (GiB)': '76.42', 'memory/max_allocated (GiB)': '76.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.7', 'tokens/total': 2058771968, 'tokens/trainable': 760974016, 'epoch': '1.859'}
 62%|█████████████████████████████████████████████████████▎                                | 1085/1751 [18:09:25<10:55:21, 59.04s/it] 62%|█████████████████████████████████████████████████████▎                                | 1086/1751 [18:10:23<10:53:11, 58.93s/it]                                                                                                                                     {'loss': '0.5233', 'grad_norm': '0.1855', 'learning_rate': '6.917e-06', 'ppl': '1.688', 'memory/max_active (GiB)': '75.26', 'memory/max_allocated (GiB)': '75.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '37.58', 'tokens/total': 2060665856, 'tokens/trainable': 761639744, 'epoch': '1.86'}
 62%|█████████████████████████████████████████████████████▎                                | 1086/1751 [18:10:23<10:53:11, 58.93s/it] 62%|█████████████████████████████████████████████████████▍                                | 1087/1751 [18:11:21<10:49:58, 58.73s/it]                                                                                                                                     {'loss': '0.5251', 'grad_norm': '0.1797', 'learning_rate': '6.899e-06', 'ppl': '1.691', 'memory/max_active (GiB)': '70.61', 'memory/max_allocated (GiB)': '70.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '83', 'tokens/total': 2062570880, 'tokens/trainable': 762320128, 'epoch': '1.862'}
 62%|█████████████████████████████████████████████████████▍                                | 1087/1751 [18:11:22<10:49:58, 58.73s/it] 62%|█████████████████████████████████████████████████████▍                                | 1088/1751 [18:12:21<10:52:53, 59.08s/it]                                                                                                                                     {'loss': '0.4843', 'grad_norm': '0.1807', 'learning_rate': '6.881e-06', 'ppl': '1.623', 'memory/max_active (GiB)': '76.29', 'memory/max_allocated (GiB)': '76.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.62', 'tokens/total': 2064476288, 'tokens/trainable': 763023808, 'epoch': '1.864'}
 62%|█████████████████████████████████████████████████████▍                                | 1088/1751 [18:12:21<10:52:53, 59.08s/it] 62%|█████████████████████████████████████████████████████▍                                | 1089/1751 [18:13:19<10:46:19, 58.58s/it]                                                                                                                                     {'loss': '0.538', 'grad_norm': '0.1885', 'learning_rate': '6.863e-06', 'ppl': '1.713', 'memory/max_active (GiB)': '70', 'memory/max_allocated (GiB)': '70', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '42.92', 'tokens/total': 2066295168, 'tokens/trainable': 763676992, 'epoch': '1.865'}
 62%|█████████████████████████████████████████████████████▍                                | 1089/1751 [18:13:19<10:46:19, 58.58s/it] 62%|█████████████████████████████████████████████████████▌                                | 1090/1751 [18:14:19<10:49:28, 58.95s/it]                                                                                                                                     {'loss': '0.5391', 'grad_norm': '0.1777', 'learning_rate': '6.845e-06', 'ppl': '1.714', 'memory/max_active (GiB)': '73.6', 'memory/max_allocated (GiB)': '73.6', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.7', 'tokens/total': 2068138880, 'tokens/trainable': 764348736, 'epoch': '1.867'}
 62%|█████████████████████████████████████████████████████▌                                | 1090/1751 [18:14:19<10:49:28, 58.95s/it] 62%|█████████████████████████████████████████████████████▌                                | 1091/1751 [18:15:19<10:54:03, 59.46s/it]                                                                                                                                     {'loss': '0.5229', 'grad_norm': '0.1748', 'learning_rate': '6.827e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '68.31', 'memory/max_allocated (GiB)': '68.31', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '78.77', 'tokens/total': 2070044032, 'tokens/trainable': 765051328, 'epoch': '1.869'}
 62%|█████████████████████████████████████████████████████▌                                | 1091/1751 [18:15:19<10:54:03, 59.46s/it] 62%|█████████████████████████████████████████████████████▋                                | 1092/1751 [18:16:16<10:44:56, 58.72s/it]                                                                                                                                     {'loss': '0.5369', 'grad_norm': '0.1943', 'learning_rate': '6.809e-06', 'ppl': '1.711', 'memory/max_active (GiB)': '75.23', 'memory/max_allocated (GiB)': '75.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '150', 'tokens/total': 2071825536, 'tokens/trainable': 765704320, 'epoch': '1.871'}
 62%|█████████████████████████████████████████████████████▋                                | 1092/1751 [18:16:16<10:44:56, 58.72s/it] 62%|█████████████████████████████████████████████████████▋                                | 1093/1751 [18:17:16<10:46:37, 58.96s/it]                                                                                                                                     {'loss': '0.5327', 'grad_norm': '0.1787', 'learning_rate': '6.792e-06', 'ppl': '1.704', 'memory/max_active (GiB)': '75.47', 'memory/max_allocated (GiB)': '75.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '10.84', 'tokens/total': 2073707648, 'tokens/trainable': 766403712, 'epoch': '1.872'}
 62%|█████████████████████████████████████████████████████▋                                | 1093/1751 [18:17:16<10:46:37, 58.96s/it] 62%|█████████████████████████████████████████████████████▋                                | 1094/1751 [18:18:18<10:55:40, 59.88s/it]                                                                                                                                     {'loss': '0.512', 'grad_norm': '0.1748', 'learning_rate': '6.774e-06', 'ppl': '1.669', 'memory/max_active (GiB)': '74.5', 'memory/max_allocated (GiB)': '74.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.82', 'tokens/total': 2075637120, 'tokens/trainable': 767113216, 'epoch': '1.874'}
 62%|█████████████████████████████████████████████████████▋                                | 1094/1751 [18:18:18<10:55:40, 59.88s/it] 63%|█████████████████████████████████████████████████████▊                                | 1095/1751 [18:19:21<11:04:21, 60.76s/it]                                                                                                                                     {'loss': '0.5022', 'grad_norm': '0.1729', 'learning_rate': '6.756e-06', 'ppl': '1.652', 'memory/max_active (GiB)': '75.84', 'memory/max_allocated (GiB)': '75.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '57.39', 'tokens/total': 2077654272, 'tokens/trainable': 767879616, 'epoch': '1.876'}
 63%|█████████████████████████████████████████████████████▊                                | 1095/1751 [18:19:21<11:04:21, 60.76s/it] 63%|█████████████████████████████████████████████████████▊                                | 1096/1751 [18:20:22<11:04:01, 60.83s/it]                                                                                                                                     {'loss': '0.4829', 'grad_norm': '0.1699', 'learning_rate': '6.738e-06', 'ppl': '1.621', 'memory/max_active (GiB)': '77.3', 'memory/max_allocated (GiB)': '77.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '50.69', 'tokens/total': 2079571712, 'tokens/trainable': 768604736, 'epoch': '1.877'}
 63%|█████████████████████████████████████████████████████▊                                | 1096/1751 [18:20:22<11:04:01, 60.83s/it] 63%|█████████████████████████████████████████████████████▉                                | 1097/1751 [18:21:18<10:48:48, 59.52s/it]                                                                                                                                     {'loss': '0.5294', 'grad_norm': '0.1963', 'learning_rate': '6.72e-06', 'ppl': '1.698', 'memory/max_active (GiB)': '72.34', 'memory/max_allocated (GiB)': '72.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.51', 'tokens/total': 2081278976, 'tokens/trainable': 769240448, 'epoch': '1.879'}
 63%|█████████████████████████████████████████████████████▉                                | 1097/1751 [18:21:18<10:48:48, 59.52s/it] 63%|█████████████████████████████████████████████████████▉                                | 1098/1751 [18:22:18<10:50:13, 59.75s/it]                                                                                                                                     {'loss': '0.5561', 'grad_norm': '0.1904', 'learning_rate': '6.702e-06', 'ppl': '1.744', 'memory/max_active (GiB)': '69.58', 'memory/max_allocated (GiB)': '69.58', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '135.3', 'tokens/total': 2083128320, 'tokens/trainable': 769951168, 'epoch': '1.881'}
 63%|█████████████████████████████████████████████████████▉                                | 1098/1751 [18:22:18<10:50:13, 59.75s/it] 63%|█████████████████████████████████████████████████████▉                                | 1099/1751 [18:23:16<10:41:45, 59.06s/it]                                                                                                                                     {'loss': '0.5502', 'grad_norm': '0.252', 'learning_rate': '6.684e-06', 'ppl': '1.734', 'memory/max_active (GiB)': '71.94', 'memory/max_allocated (GiB)': '71.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111.2', 'tokens/total': 2084886272, 'tokens/trainable': 770622784, 'epoch': '1.882'}
 63%|█████████████████████████████████████████████████████▉                                | 1099/1751 [18:23:16<10:41:45, 59.06s/it] 63%|██████████████████████████████████████████████████████                                | 1100/1751 [18:24:15<10:39:51, 58.97s/it]                                                                                                                                     {'loss': '0.5258', 'grad_norm': '0.1748', 'learning_rate': '6.667e-06', 'ppl': '1.692', 'memory/max_active (GiB)': '75.07', 'memory/max_allocated (GiB)': '75.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.68', 'tokens/total': 2086748160, 'tokens/trainable': 771338624, 'epoch': '1.884'}
 63%|██████████████████████████████████████████████████████                                | 1100/1751 [18:24:15<10:39:51, 58.97s/it] 63%|██████████████████████████████████████████████████████                                | 1101/1751 [18:25:16<10:45:52, 59.62s/it]                                                                                                                                     {'loss': '0.5646', 'grad_norm': '0.1846', 'learning_rate': '6.649e-06', 'ppl': '1.759', 'memory/max_active (GiB)': '72.49', 'memory/max_allocated (GiB)': '72.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '164.1', 'tokens/total': 2088648448, 'tokens/trainable': 772042432, 'epoch': '1.886'}
 63%|██████████████████████████████████████████████████████                                | 1101/1751 [18:25:16<10:45:52, 59.62s/it] 63%|██████████████████████████████████████████████████████                                | 1102/1751 [18:26:18<10:52:41, 60.34s/it]                                                                                                                                     {'loss': '0.5168', 'grad_norm': '0.1709', 'learning_rate': '6.631e-06', 'ppl': '1.677', 'memory/max_active (GiB)': '75.36', 'memory/max_allocated (GiB)': '75.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '66.65', 'tokens/total': 2090636032, 'tokens/trainable': 772794112, 'epoch': '1.888'}
 63%|██████████████████████████████████████████████████████                                | 1102/1751 [18:26:18<10:52:41, 60.34s/it] 63%|██████████████████████████████████████████████████████▏                               | 1103/1751 [18:27:17<10:47:05, 59.92s/it]                                                                                                                                     {'loss': '0.5495', 'grad_norm': '0.1787', 'learning_rate': '6.613e-06', 'ppl': '1.732', 'memory/max_active (GiB)': '69.27', 'memory/max_allocated (GiB)': '69.27', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.39', 'tokens/total': 2092473856, 'tokens/trainable': 773494784, 'epoch': '1.889'}
 63%|██████████████████████████████████████████████████████▏                               | 1103/1751 [18:27:17<10:47:05, 59.92s/it] 63%|██████████████████████████████████████████████████████▏                               | 1104/1751 [18:28:18<10:52:01, 60.47s/it]                                                                                                                                     {'loss': '0.5091', 'grad_norm': '0.1777', 'learning_rate': '6.596e-06', 'ppl': '1.664', 'memory/max_active (GiB)': '76.62', 'memory/max_allocated (GiB)': '76.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '162.9', 'tokens/total': 2094422400, 'tokens/trainable': 774233920, 'epoch': '1.891'}
 63%|██████████████████████████████████████████████████████▏                               | 1104/1751 [18:28:18<10:52:01, 60.47s/it] 63%|██████████████████████████████████████████████████████▎                               | 1105/1751 [18:29:19<10:49:58, 60.37s/it]                                                                                                                                     {'loss': '0.5291', 'grad_norm': '0.1855', 'learning_rate': '6.578e-06', 'ppl': '1.697', 'memory/max_active (GiB)': '72.9', 'memory/max_allocated (GiB)': '72.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.63', 'tokens/total': 2096377984, 'tokens/trainable': 774960960, 'epoch': '1.893'}
 63%|██████████████████████████████████████████████████████▎                               | 1105/1751 [18:29:19<10:49:58, 60.37s/it] 63%|██████████████████████████████████████████████████████▎                               | 1106/1751 [18:30:20<10:50:58, 60.56s/it]                                                                                                                                     {'loss': '0.4705', 'grad_norm': '0.167', 'learning_rate': '6.56e-06', 'ppl': '1.601', 'memory/max_active (GiB)': '75.11', 'memory/max_allocated (GiB)': '75.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '78.14', 'tokens/total': 2098296320, 'tokens/trainable': 775716416, 'epoch': '1.894'}
 63%|██████████████████████████████████████████████████████▎                               | 1106/1751 [18:30:20<10:50:58, 60.56s/it] 63%|██████████████████████████████████████████████████████▎                               | 1107/1751 [18:31:20<10:51:02, 60.66s/it]                                                                                                                                     {'loss': '0.5069', 'grad_norm': '0.1807', 'learning_rate': '6.542e-06', 'ppl': '1.66', 'memory/max_active (GiB)': '73.07', 'memory/max_allocated (GiB)': '73.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '113.9', 'tokens/total': 2100253568, 'tokens/trainable': 776441472, 'epoch': '1.896'}
 63%|██████████████████████████████████████████████████████▎                               | 1107/1751 [18:31:20<10:51:02, 60.66s/it] 63%|██████████████████████████████████████████████████████▍                               | 1108/1751 [18:32:20<10:47:56, 60.46s/it]                                                                                                                                     {'loss': '0.5158', 'grad_norm': '0.1797', 'learning_rate': '6.525e-06', 'ppl': '1.675', 'memory/max_active (GiB)': '75.77', 'memory/max_allocated (GiB)': '75.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '133.7', 'tokens/total': 2102159744, 'tokens/trainable': 777161024, 'epoch': '1.898'}
 63%|██████████████████████████████████████████████████████▍                               | 1108/1751 [18:32:20<10:47:56, 60.46s/it] 63%|██████████████████████████████████████████████████████▍                               | 1109/1751 [18:33:18<10:38:10, 59.64s/it]                                                                                                                                     {'loss': '0.5215', 'grad_norm': '0.1846', 'learning_rate': '6.507e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '74.53', 'memory/max_allocated (GiB)': '74.53', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111.1', 'tokens/total': 2103958528, 'tokens/trainable': 777817600, 'epoch': '1.9'}
 63%|██████████████████████████████████████████████████████▍                               | 1109/1751 [18:33:18<10:38:10, 59.64s/it] 63%|██████████████████████████████████████████████████████▌                               | 1110/1751 [18:34:19<10:41:38, 60.06s/it]                                                                                                                                     {'loss': '0.5129', 'grad_norm': '0.1719', 'learning_rate': '6.489e-06', 'ppl': '1.67', 'memory/max_active (GiB)': '74.61', 'memory/max_allocated (GiB)': '74.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '48.26', 'tokens/total': 2105907968, 'tokens/trainable': 778566208, 'epoch': '1.901'}
 63%|██████████████████████████████████████████████████████▌                               | 1110/1751 [18:34:19<10:41:38, 60.06s/it] 63%|██████████████████████████████████████████████████████▌                               | 1111/1751 [18:35:18<10:35:13, 59.55s/it]                                                                                                                                     {'loss': '0.4991', 'grad_norm': '0.1855', 'learning_rate': '6.472e-06', 'ppl': '1.647', 'memory/max_active (GiB)': '72.63', 'memory/max_allocated (GiB)': '72.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '72.85', 'tokens/total': 2107778432, 'tokens/trainable': 779266432, 'epoch': '1.903'}
 63%|██████████████████████████████████████████████████████▌                               | 1111/1751 [18:35:18<10:35:13, 59.55s/it] 64%|██████████████████████████████████████████████████████▌                               | 1112/1751 [18:36:19<10:41:47, 60.26s/it]                                                                                                                                     {'loss': '0.5125', 'grad_norm': '0.1777', 'learning_rate': '6.454e-06', 'ppl': '1.669', 'memory/max_active (GiB)': '75.63', 'memory/max_allocated (GiB)': '75.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '27.21', 'tokens/total': 2109731840, 'tokens/trainable': 779998144, 'epoch': '1.905'}
 64%|██████████████████████████████████████████████████████▌                               | 1112/1751 [18:36:20<10:41:47, 60.26s/it] 64%|██████████████████████████████████████████████████████▋                               | 1113/1751 [18:37:24<10:52:53, 61.40s/it]                                                                                                                                     {'loss': '0.519', 'grad_norm': '0.1738', 'learning_rate': '6.436e-06', 'ppl': '1.68', 'memory/max_active (GiB)': '76.78', 'memory/max_allocated (GiB)': '76.78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '34.19', 'tokens/total': 2111746176, 'tokens/trainable': 780770816, 'epoch': '1.906'}
 64%|██████████████████████████████████████████████████████▋                               | 1113/1751 [18:37:24<10:52:53, 61.40s/it] 64%|██████████████████████████████████████████████████████▋                               | 1114/1751 [18:38:24<10:49:31, 61.18s/it]                                                                                                                                     {'loss': '0.5034', 'grad_norm': '0.166', 'learning_rate': '6.419e-06', 'ppl': '1.654', 'memory/max_active (GiB)': '73.46', 'memory/max_allocated (GiB)': '73.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '38.6', 'tokens/total': 2113631232, 'tokens/trainable': 781489344, 'epoch': '1.908'}
 64%|██████████████████████████████████████████████████████▋                               | 1114/1751 [18:38:24<10:49:31, 61.18s/it] 64%|██████████████████████████████████████████████████████▊                               | 1115/1751 [18:39:21<10:34:53, 59.90s/it]                                                                                                                                     {'loss': '0.5521', 'grad_norm': '0.1943', 'learning_rate': '6.401e-06', 'ppl': '1.737', 'memory/max_active (GiB)': '73.56', 'memory/max_allocated (GiB)': '73.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '56.57', 'tokens/total': 2115419008, 'tokens/trainable': 782143552, 'epoch': '1.91'}
 64%|██████████████████████████████████████████████████████▊                               | 1115/1751 [18:39:21<10:34:53, 59.90s/it] 64%|██████████████████████████████████████████████████████▊                               | 1116/1751 [18:40:22<10:36:11, 60.11s/it]                                                                                                                                     {'loss': '0.5211', 'grad_norm': '0.1807', 'learning_rate': '6.383e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '75.74', 'memory/max_allocated (GiB)': '75.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '92.5', 'tokens/total': 2117322880, 'tokens/trainable': 782861376, 'epoch': '1.912'}
 64%|██████████████████████████████████████████████████████▊                               | 1116/1751 [18:40:22<10:36:11, 60.11s/it] 64%|██████████████████████████████████████████████████████▊                               | 1117/1751 [18:41:22<10:34:55, 60.09s/it]                                                                                                                                     {'loss': '0.5155', 'grad_norm': '0.1807', 'learning_rate': '6.366e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '73.81', 'memory/max_allocated (GiB)': '73.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '169.8', 'tokens/total': 2119219072, 'tokens/trainable': 783565248, 'epoch': '1.913'}
 64%|██████████████████████████████████████████████████████▊                               | 1117/1751 [18:41:22<10:34:55, 60.09s/it] 64%|██████████████████████████████████████████████████████▉                               | 1118/1751 [18:42:21<10:31:34, 59.86s/it]                                                                                                                                     {'loss': '0.5326', 'grad_norm': '0.1836', 'learning_rate': '6.348e-06', 'ppl': '1.703', 'memory/max_active (GiB)': '75.77', 'memory/max_allocated (GiB)': '75.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '56.2', 'tokens/total': 2121023744, 'tokens/trainable': 784240640, 'epoch': '1.915'}
 64%|██████████████████████████████████████████████████████▉                               | 1118/1751 [18:42:21<10:31:34, 59.86s/it] 64%|██████████████████████████████████████████████████████▉                               | 1119/1751 [18:43:22<10:33:05, 60.10s/it]                                                                                                                                     {'loss': '0.4931', 'grad_norm': '0.1699', 'learning_rate': '6.331e-06', 'ppl': '1.637', 'memory/max_active (GiB)': '73.3', 'memory/max_allocated (GiB)': '73.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '115.4', 'tokens/total': 2122980480, 'tokens/trainable': 784978048, 'epoch': '1.917'}
 64%|██████████████████████████████████████████████████████▉                               | 1119/1751 [18:43:22<10:33:05, 60.10s/it] 64%|███████████████████████████████████████████████████████                               | 1120/1751 [18:44:21<10:30:12, 59.92s/it]                                                                                                                                     {'loss': '0.5214', 'grad_norm': '0.1797', 'learning_rate': '6.313e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '73.37', 'memory/max_allocated (GiB)': '73.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '77.41', 'tokens/total': 2124821376, 'tokens/trainable': 785670080, 'epoch': '1.918'}
 64%|███████████████████████████████████████████████████████                               | 1120/1751 [18:44:21<10:30:12, 59.92s/it] 64%|███████████████████████████████████████████████████████                               | 1121/1751 [18:45:20<10:26:53, 59.70s/it]                                                                                                                                     {'loss': '0.5399', 'grad_norm': '0.1836', 'learning_rate': '6.296e-06', 'ppl': '1.716', 'memory/max_active (GiB)': '69.68', 'memory/max_allocated (GiB)': '69.68', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.17', 'tokens/total': 2126684160, 'tokens/trainable': 786357056, 'epoch': '1.92'}
 64%|███████████████████████████████████████████████████████                               | 1121/1751 [18:45:20<10:26:53, 59.70s/it] 64%|███████████████████████████████████████████████████████                               | 1122/1751 [18:46:21<10:26:59, 59.81s/it]                                                                                                                                     {'loss': '0.5168', 'grad_norm': '0.1797', 'learning_rate': '6.278e-06', 'ppl': '1.677', 'memory/max_active (GiB)': '71.04', 'memory/max_allocated (GiB)': '71.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '140.2', 'tokens/total': 2128620544, 'tokens/trainable': 787066112, 'epoch': '1.922'}
 64%|███████████████████████████████████████████████████████                               | 1122/1751 [18:46:21<10:26:59, 59.81s/it] 64%|███████████████████████████████████████████████████████▏                              | 1123/1751 [18:47:21<10:28:06, 60.01s/it]                                                                                                                                     {'loss': '0.5129', 'grad_norm': '0.1748', 'learning_rate': '6.261e-06', 'ppl': '1.67', 'memory/max_active (GiB)': '76.7', 'memory/max_allocated (GiB)': '76.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '41.53', 'tokens/total': 2130527744, 'tokens/trainable': 787767296, 'epoch': '1.924'}
 64%|███████████████████████████████████████████████████████▏                              | 1123/1751 [18:47:21<10:28:06, 60.01s/it] 64%|███████████████████████████████████████████████████████▏                              | 1124/1751 [18:48:23<10:32:29, 60.53s/it]                                                                                                                                     {'loss': '0.4889', 'grad_norm': '0.1768', 'learning_rate': '6.243e-06', 'ppl': '1.63', 'memory/max_active (GiB)': '75.61', 'memory/max_allocated (GiB)': '75.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111.7', 'tokens/total': 2132475648, 'tokens/trainable': 788481344, 'epoch': '1.925'}
 64%|███████████████████████████████████████████████████████▏                              | 1124/1751 [18:48:23<10:32:29, 60.53s/it] 64%|███████████████████████████████████████████████████████▎                              | 1125/1751 [18:49:22<10:26:45, 60.07s/it]                                                                                                                                     {'loss': '0.5384', 'grad_norm': '0.1943', 'learning_rate': '6.226e-06', 'ppl': '1.713', 'memory/max_active (GiB)': '68.88', 'memory/max_allocated (GiB)': '68.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '56.92', 'tokens/total': 2134332160, 'tokens/trainable': 789152256, 'epoch': '1.927'}
 64%|███████████████████████████████████████████████████████▎                              | 1125/1751 [18:49:22<10:26:45, 60.07s/it] 64%|███████████████████████████████████████████████████████▎                              | 1126/1751 [18:50:22<10:26:21, 60.13s/it]                                                                                                                                     {'loss': '0.5199', 'grad_norm': '0.1797', 'learning_rate': '6.208e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '76.26', 'memory/max_allocated (GiB)': '76.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '171.9', 'tokens/total': 2136248832, 'tokens/trainable': 789852352, 'epoch': '1.929'}
 64%|███████████████████████████████████████████████████████▎                              | 1126/1751 [18:50:22<10:26:21, 60.13s/it] 64%|███████████████████████████████████████████████████████▎                              | 1127/1751 [18:51:21<10:20:30, 59.66s/it]                                                                                                                                     {'loss': '0.5277', 'grad_norm': '0.1826', 'learning_rate': '6.191e-06', 'ppl': '1.695', 'memory/max_active (GiB)': '73.98', 'memory/max_allocated (GiB)': '73.98', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.19', 'tokens/total': 2138083712, 'tokens/trainable': 790508672, 'epoch': '1.93'}
 64%|███████████████████████████████████████████████████████▎                              | 1127/1751 [18:51:21<10:20:30, 59.66s/it] 64%|███████████████████████████████████████████████████████▍                              | 1128/1751 [18:52:19<10:15:21, 59.26s/it]                                                                                                                                     {'loss': '0.5261', 'grad_norm': '0.1836', 'learning_rate': '6.173e-06', 'ppl': '1.692', 'memory/max_active (GiB)': '74.93', 'memory/max_allocated (GiB)': '74.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '164.5', 'tokens/total': 2139942784, 'tokens/trainable': 791212480, 'epoch': '1.932'}
 64%|███████████████████████████████████████████████████████▍                              | 1128/1751 [18:52:19<10:15:21, 59.26s/it] 64%|███████████████████████████████████████████████████████▍                              | 1129/1751 [18:53:18<10:14:21, 59.26s/it]                                                                                                                                     {'loss': '0.4977', 'grad_norm': '0.1797', 'learning_rate': '6.156e-06', 'ppl': '1.645', 'memory/max_active (GiB)': '71.38', 'memory/max_allocated (GiB)': '71.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '91.05', 'tokens/total': 2141862144, 'tokens/trainable': 791896512, 'epoch': '1.934'}
 64%|███████████████████████████████████████████████████████▍                              | 1129/1751 [18:53:18<10:14:21, 59.26s/it] 65%|███████████████████████████████████████████████████████▍                              | 1130/1751 [18:54:18<10:16:12, 59.54s/it]                                                                                                                                     {'loss': '0.5166', 'grad_norm': '0.168', 'learning_rate': '6.138e-06', 'ppl': '1.676', 'memory/max_active (GiB)': '76.84', 'memory/max_allocated (GiB)': '76.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.84', 'tokens/total': 2143806336, 'tokens/trainable': 792637888, 'epoch': '1.936'}
 65%|███████████████████████████████████████████████████████▍                              | 1130/1751 [18:54:18<10:16:12, 59.54s/it] 65%|███████████████████████████████████████████████████████▌                              | 1131/1751 [18:55:17<10:12:56, 59.32s/it]                                                                                                                                     {'loss': '0.5151', 'grad_norm': '0.1865', 'learning_rate': '6.121e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '74.77', 'memory/max_allocated (GiB)': '74.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80', 'tokens/total': 2145649152, 'tokens/trainable': 793314560, 'epoch': '1.937'}
 65%|███████████████████████████████████████████████████████▌                              | 1131/1751 [18:55:17<10:12:56, 59.32s/it] 65%|███████████████████████████████████████████████████████▌                              | 1132/1751 [18:56:15<10:07:03, 58.84s/it]                                                                                                                                     {'loss': '0.521', 'grad_norm': '0.1934', 'learning_rate': '6.104e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '77.54', 'memory/max_allocated (GiB)': '77.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.23', 'tokens/total': 2147464704, 'tokens/trainable': 793950336, 'epoch': '1.939'}
 65%|███████████████████████████████████████████████████████▌                              | 1132/1751 [18:56:15<10:07:03, 58.84s/it] 65%|███████████████████████████████████████████████████████▋                              | 1133/1751 [18:57:15<10:10:51, 59.31s/it]                                                                                                                                     {'loss': '0.4911', 'grad_norm': '0.1758', 'learning_rate': '6.086e-06', 'ppl': '1.634', 'memory/max_active (GiB)': '73.76', 'memory/max_allocated (GiB)': '73.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '161.8', 'tokens/total': 2149416960, 'tokens/trainable': 794670848, 'epoch': '1.941'}
 65%|███████████████████████████████████████████████████████▋                              | 1133/1751 [18:57:15<10:10:51, 59.31s/it] 65%|███████████████████████████████████████████████████████▋                              | 1134/1751 [18:58:16<10:14:58, 59.80s/it]                                                                                                                                     {'loss': '0.5084', 'grad_norm': '0.1787', 'learning_rate': '6.069e-06', 'ppl': '1.663', 'memory/max_active (GiB)': '76.42', 'memory/max_allocated (GiB)': '76.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '190.6', 'tokens/total': 2151379712, 'tokens/trainable': 795373568, 'epoch': '1.942'}
 65%|███████████████████████████████████████████████████████▋                              | 1134/1751 [18:58:16<10:14:58, 59.80s/it] 65%|███████████████████████████████████████████████████████▋                              | 1135/1751 [18:59:19<10:22:21, 60.62s/it]                                                                                                                                     {'loss': '0.5026', 'grad_norm': '0.1797', 'learning_rate': '6.051e-06', 'ppl': '1.653', 'memory/max_active (GiB)': '74.8', 'memory/max_allocated (GiB)': '74.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.1', 'tokens/total': 2153394944, 'tokens/trainable': 796126656, 'epoch': '1.944'}
 65%|███████████████████████████████████████████████████████▋                              | 1135/1751 [18:59:19<10:22:21, 60.62s/it] 65%|███████████████████████████████████████████████████████▊                              | 1136/1751 [19:00:16<10:12:21, 59.74s/it]                                                                                                                                     {'loss': '0.5572', 'grad_norm': '0.1826', 'learning_rate': '6.034e-06', 'ppl': '1.746', 'memory/max_active (GiB)': '73.18', 'memory/max_allocated (GiB)': '73.18', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '58.62', 'tokens/total': 2155211520, 'tokens/trainable': 796849024, 'epoch': '1.946'}
 65%|███████████████████████████████████████████████████████▊                              | 1136/1751 [19:00:16<10:12:21, 59.74s/it] 65%|███████████████████████████████████████████████████████▊                              | 1137/1751 [19:01:14<10:05:08, 59.13s/it]                                                                                                                                     {'loss': '0.556', 'grad_norm': '0.1885', 'learning_rate': '6.017e-06', 'ppl': '1.744', 'memory/max_active (GiB)': '72.16', 'memory/max_allocated (GiB)': '72.16', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '131', 'tokens/total': 2157027584, 'tokens/trainable': 797524352, 'epoch': '1.948'}
 65%|███████████████████████████████████████████████████████▊                              | 1137/1751 [19:01:14<10:05:08, 59.13s/it] 65%|███████████████████████████████████████████████████████▉                              | 1138/1751 [19:02:16<10:12:07, 59.91s/it]                                                                                                                                     {'loss': '0.5144', 'grad_norm': '0.1846', 'learning_rate': '5.999e-06', 'ppl': '1.673', 'memory/max_active (GiB)': '77.18', 'memory/max_allocated (GiB)': '77.18', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.75', 'tokens/total': 2158999040, 'tokens/trainable': 798263488, 'epoch': '1.949'}
 65%|███████████████████████████████████████████████████████▉                              | 1138/1751 [19:02:16<10:12:07, 59.91s/it] 65%|███████████████████████████████████████████████████████▉                              | 1139/1751 [19:03:15<10:09:33, 59.76s/it]                                                                                                                                     {'loss': '0.5293', 'grad_norm': '0.1934', 'learning_rate': '5.982e-06', 'ppl': '1.698', 'memory/max_active (GiB)': '76.03', 'memory/max_allocated (GiB)': '76.03', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '89.33', 'tokens/total': 2160884224, 'tokens/trainable': 798974720, 'epoch': '1.951'}
 65%|███████████████████████████████████████████████████████▉                              | 1139/1751 [19:03:15<10:09:33, 59.76s/it] 65%|███████████████████████████████████████████████████████▉                              | 1140/1751 [19:04:13<10:02:06, 59.13s/it]                                                                                                                                     {'loss': '0.5281', 'grad_norm': '0.1836', 'learning_rate': '5.965e-06', 'ppl': '1.696', 'memory/max_active (GiB)': '73.32', 'memory/max_allocated (GiB)': '73.32', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '42.38', 'tokens/total': 2162720768, 'tokens/trainable': 799635200, 'epoch': '1.953'}
 65%|███████████████████████████████████████████████████████▉                              | 1140/1751 [19:04:13<10:02:06, 59.13s/it] 65%|████████████████████████████████████████████████████████                              | 1141/1751 [19:05:16<10:13:18, 60.33s/it]                                                                                                                                     {'loss': '0.4763', 'grad_norm': '0.1611', 'learning_rate': '5.948e-06', 'ppl': '1.61', 'memory/max_active (GiB)': '76.92', 'memory/max_allocated (GiB)': '76.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.31', 'tokens/total': 2164764416, 'tokens/trainable': 800426496, 'epoch': '1.954'}
 65%|████████████████████████████████████████████████████████                              | 1141/1751 [19:05:16<10:13:18, 60.33s/it] 65%|████████████████████████████████████████████████████████                              | 1142/1751 [19:06:17<10:14:18, 60.52s/it]                                                                                                                                     {'loss': '0.5097', 'grad_norm': '0.168', 'learning_rate': '5.93e-06', 'ppl': '1.665', 'memory/max_active (GiB)': '72.23', 'memory/max_allocated (GiB)': '72.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '143.6', 'tokens/total': 2166721536, 'tokens/trainable': 801153024, 'epoch': '1.956'}
 65%|████████████████████████████████████████████████████████                              | 1142/1751 [19:06:17<10:14:18, 60.52s/it] 65%|████████████████████████████████████████████████████████▏                             | 1143/1751 [19:07:17<10:11:31, 60.35s/it]                                                                                                                                     {'loss': '0.5351', 'grad_norm': '0.1768', 'learning_rate': '5.913e-06', 'ppl': '1.708', 'memory/max_active (GiB)': '73.55', 'memory/max_allocated (GiB)': '73.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '44.68', 'tokens/total': 2168670976, 'tokens/trainable': 801906368, 'epoch': '1.958'}
 65%|████████████████████████████████████████████████████████▏                             | 1143/1751 [19:07:17<10:11:31, 60.35s/it] 65%|████████████████████████████████████████████████████████▏                             | 1144/1751 [19:08:16<10:05:58, 59.90s/it]                                                                                                                                     {'loss': '0.5338', 'grad_norm': '0.1865', 'learning_rate': '5.896e-06', 'ppl': '1.705', 'memory/max_active (GiB)': '75.29', 'memory/max_allocated (GiB)': '75.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '35.15', 'tokens/total': 2170555904, 'tokens/trainable': 802606528, 'epoch': '1.96'}
 65%|████████████████████████████████████████████████████████▏                             | 1144/1751 [19:08:16<10:05:58, 59.90s/it] 65%|████████████████████████████████████████████████████████▏                             | 1145/1751 [19:09:18<10:10:54, 60.49s/it]                                                                                                                                     {'loss': '0.5322', 'grad_norm': '0.1699', 'learning_rate': '5.879e-06', 'ppl': '1.703', 'memory/max_active (GiB)': '71.38', 'memory/max_allocated (GiB)': '71.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '134.4', 'tokens/total': 2172495360, 'tokens/trainable': 803345088, 'epoch': '1.961'}
 65%|████████████████████████████████████████████████████████▏                             | 1145/1751 [19:09:18<10:10:54, 60.49s/it] 65%|████████████████████████████████████████████████████████▎                             | 1146/1751 [19:10:17<10:06:22, 60.14s/it]                                                                                                                                     {'loss': '0.559', 'grad_norm': '0.1963', 'learning_rate': '5.861e-06', 'ppl': '1.749', 'memory/max_active (GiB)': '72.77', 'memory/max_allocated (GiB)': '72.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '146.7', 'tokens/total': 2174358016, 'tokens/trainable': 803998016, 'epoch': '1.963'}
 65%|████████████████████████████████████████████████████████▎                             | 1146/1751 [19:10:17<10:06:22, 60.14s/it] 66%|████████████████████████████████████████████████████████▎                             | 1147/1751 [19:11:16<10:02:11, 59.82s/it]                                                                                                                                     {'loss': '0.5063', 'grad_norm': '0.1885', 'learning_rate': '5.844e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '74.1', 'memory/max_allocated (GiB)': '74.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '48.48', 'tokens/total': 2176176384, 'tokens/trainable': 804638208, 'epoch': '1.965'}
 66%|████████████████████████████████████████████████████████▎                             | 1147/1751 [19:11:16<10:02:11, 59.82s/it] 66%|████████████████████████████████████████████████████████▍                             | 1148/1751 [19:12:17<10:03:44, 60.07s/it]                                                                                                                                     {'loss': '0.5221', 'grad_norm': '0.1846', 'learning_rate': '5.827e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '66.84', 'memory/max_allocated (GiB)': '66.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '126.8', 'tokens/total': 2178067968, 'tokens/trainable': 805332800, 'epoch': '1.966'}
 66%|████████████████████████████████████████████████████████▍                             | 1148/1751 [19:12:17<10:03:44, 60.07s/it] 66%|████████████████████████████████████████████████████████▍                             | 1149/1751 [19:13:18<10:05:04, 60.31s/it]                                                                                                                                     {'loss': '0.5023', 'grad_norm': '0.1768', 'learning_rate': '5.81e-06', 'ppl': '1.652', 'memory/max_active (GiB)': '71.17', 'memory/max_allocated (GiB)': '71.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '112.4', 'tokens/total': 2179956992, 'tokens/trainable': 806030784, 'epoch': '1.968'}
 66%|████████████████████████████████████████████████████████▍                             | 1149/1751 [19:13:18<10:05:04, 60.31s/it] 66%|████████████████████████████████████████████████████████▍                             | 1150/1751 [19:14:18<10:03:23, 60.24s/it]                                                                                                                                     {'loss': '0.5262', 'grad_norm': '0.1787', 'learning_rate': '5.793e-06', 'ppl': '1.692', 'memory/max_active (GiB)': '74.33', 'memory/max_allocated (GiB)': '74.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '124.2', 'tokens/total': 2181846016, 'tokens/trainable': 806717312, 'epoch': '1.97'}
 66%|████████████████████████████████████████████████████████▍                             | 1150/1751 [19:14:18<10:03:23, 60.24s/it] 66%|████████████████████████████████████████████████████████▌                             | 1151/1751 [19:15:19<10:05:54, 60.59s/it]                                                                                                                                     {'loss': '0.5063', 'grad_norm': '0.1709', 'learning_rate': '5.776e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '74.66', 'memory/max_allocated (GiB)': '74.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.8', 'tokens/total': 2183835648, 'tokens/trainable': 807440704, 'epoch': '1.972'}
 66%|████████████████████████████████████████████████████████▌                             | 1151/1751 [19:15:19<10:05:54, 60.59s/it] 66%|█████████████████████████████████████████████████████████▏                             | 1152/1751 [19:16:16<9:54:43, 59.57s/it]                                                                                                                                     {'loss': '0.5239', 'grad_norm': '0.1855', 'learning_rate': '5.759e-06', 'ppl': '1.689', 'memory/max_active (GiB)': '73', 'memory/max_allocated (GiB)': '73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.65', 'tokens/total': 2185634560, 'tokens/trainable': 808087680, 'epoch': '1.973'}
 66%|█████████████████████████████████████████████████████████▏                             | 1152/1751 [19:16:16<9:54:43, 59.57s/it] 66%|████████████████████████████████████████████████████████▋                             | 1153/1751 [19:17:18<10:00:54, 60.29s/it]                                                                                                                                     {'loss': '0.4867', 'grad_norm': '0.1816', 'learning_rate': '5.742e-06', 'ppl': '1.627', 'memory/max_active (GiB)': '75.91', 'memory/max_allocated (GiB)': '75.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.29', 'tokens/total': 2187622144, 'tokens/trainable': 808814464, 'epoch': '1.975'}
 66%|████████████████████████████████████████████████████████▋                             | 1153/1751 [19:17:18<10:00:54, 60.29s/it] 66%|█████████████████████████████████████████████████████████▎                             | 1154/1751 [19:18:18<9:57:02, 60.00s/it]                                                                                                                                     {'loss': '0.5227', 'grad_norm': '0.1943', 'learning_rate': '5.724e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '77.54', 'memory/max_allocated (GiB)': '77.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '97.62', 'tokens/total': 2189528832, 'tokens/trainable': 809521088, 'epoch': '1.977'}
 66%|█████████████████████████████████████████████████████████▎                             | 1154/1751 [19:18:18<9:57:02, 60.00s/it] 66%|█████████████████████████████████████████████████████████▍                             | 1155/1751 [19:19:15<9:47:01, 59.10s/it]                                                                                                                                     {'loss': '0.566', 'grad_norm': '0.1982', 'learning_rate': '5.707e-06', 'ppl': '1.761', 'memory/max_active (GiB)': '69.49', 'memory/max_allocated (GiB)': '69.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '76.74', 'tokens/total': 2191340032, 'tokens/trainable': 810172480, 'epoch': '1.978'}
 66%|█████████████████████████████████████████████████████████▍                             | 1155/1751 [19:19:15<9:47:01, 59.10s/it] 66%|█████████████████████████████████████████████████████████▍                             | 1156/1751 [19:20:15<9:50:13, 59.52s/it]                                                                                                                                     {'loss': '0.5015', 'grad_norm': '0.1904', 'learning_rate': '5.69e-06', 'ppl': '1.651', 'memory/max_active (GiB)': '72.38', 'memory/max_allocated (GiB)': '72.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '126.2', 'tokens/total': 2193288704, 'tokens/trainable': 810916224, 'epoch': '1.98'}
 66%|█████████████████████████████████████████████████████████▍                             | 1156/1751 [19:20:15<9:50:13, 59.52s/it] 66%|█████████████████████████████████████████████████████████▍                             | 1157/1751 [19:21:17<9:56:18, 60.23s/it]                                                                                                                                     {'loss': '0.5027', 'grad_norm': '0.1689', 'learning_rate': '5.673e-06', 'ppl': '1.653', 'memory/max_active (GiB)': '73.37', 'memory/max_allocated (GiB)': '73.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26.48', 'tokens/total': 2195293184, 'tokens/trainable': 811653184, 'epoch': '1.982'}
 66%|█████████████████████████████████████████████████████████▍                             | 1157/1751 [19:21:17<9:56:18, 60.23s/it] 66%|█████████████████████████████████████████████████████████▌                             | 1158/1751 [19:22:19<9:59:35, 60.67s/it]                                                                                                                                     {'loss': '0.4999', 'grad_norm': '0.1738', 'learning_rate': '5.656e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '71.22', 'memory/max_allocated (GiB)': '71.22', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '87.94', 'tokens/total': 2197268992, 'tokens/trainable': 812412352, 'epoch': '1.984'}
 66%|█████████████████████████████████████████████████████████▌                             | 1158/1751 [19:22:19<9:59:35, 60.67s/it] 66%|████████████████████████████████████████████████████████▉                             | 1159/1751 [19:23:20<10:00:21, 60.85s/it]                                                                                                                                     {'loss': '0.529', 'grad_norm': '0.1865', 'learning_rate': '5.639e-06', 'ppl': '1.697', 'memory/max_active (GiB)': '70.06', 'memory/max_allocated (GiB)': '70.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '68.97', 'tokens/total': 2199253504, 'tokens/trainable': 813138368, 'epoch': '1.985'}
 66%|████████████████████████████████████████████████████████▉                             | 1159/1751 [19:23:20<10:00:21, 60.85s/it] 66%|████████████████████████████████████████████████████████▉                             | 1160/1751 [19:24:22<10:02:13, 61.14s/it]                                                                                                                                     {'loss': '0.4963', 'grad_norm': '0.1699', 'learning_rate': '5.622e-06', 'ppl': '1.643', 'memory/max_active (GiB)': '75.92', 'memory/max_allocated (GiB)': '75.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '193.3', 'tokens/total': 2201256192, 'tokens/trainable': 813879232, 'epoch': '1.987'}
 66%|████████████████████████████████████████████████████████▉                             | 1160/1751 [19:24:22<10:02:13, 61.14s/it] 66%|█████████████████████████████████████████████████████████▋                             | 1161/1751 [19:25:22<9:57:58, 60.81s/it]                                                                                                                                     {'loss': '0.4941', 'grad_norm': '0.1748', 'learning_rate': '5.605e-06', 'ppl': '1.639', 'memory/max_active (GiB)': '77.28', 'memory/max_allocated (GiB)': '77.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.3', 'tokens/total': 2203200512, 'tokens/trainable': 814596480, 'epoch': '1.989'}
 66%|█████████████████████████████████████████████████████████▋                             | 1161/1751 [19:25:22<9:57:58, 60.81s/it] 66%|█████████████████████████████████████████████████████████▋                             | 1162/1751 [19:26:21<9:53:34, 60.47s/it]                                                                                                                                     {'loss': '0.5224', 'grad_norm': '0.1807', 'learning_rate': '5.588e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '76.77', 'memory/max_allocated (GiB)': '76.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '168', 'tokens/total': 2205114880, 'tokens/trainable': 815295104, 'epoch': '1.99'}
 66%|█████████████████████████████████████████████████████████▋                             | 1162/1751 [19:26:21<9:53:34, 60.47s/it] 66%|█████████████████████████████████████████████████████████▊                             | 1163/1751 [19:27:22<9:51:42, 60.38s/it]                                                                                                                                     {'loss': '0.5181', 'grad_norm': '0.1768', 'learning_rate': '5.571e-06', 'ppl': '1.679', 'memory/max_active (GiB)': '73.3', 'memory/max_allocated (GiB)': '73.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '7.75', 'tokens/total': 2207067136, 'tokens/trainable': 816017408, 'epoch': '1.992'}
 66%|█████████████████████████████████████████████████████████▊                             | 1163/1751 [19:27:22<9:51:42, 60.38s/it] 66%|█████████████████████████████████████████████████████████▊                             | 1164/1751 [19:28:24<9:56:01, 60.92s/it]                                                                                                                                     {'loss': '0.5408', 'grad_norm': '0.1797', 'learning_rate': '5.555e-06', 'ppl': '1.717', 'memory/max_active (GiB)': '72.02', 'memory/max_allocated (GiB)': '72.02', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '89.02', 'tokens/total': 2209042688, 'tokens/trainable': 816748288, 'epoch': '1.994'}
 66%|█████████████████████████████████████████████████████████▊                             | 1164/1751 [19:28:24<9:56:01, 60.92s/it] 67%|█████████████████████████████████████████████████████████▉                             | 1165/1751 [19:29:23<9:48:51, 60.29s/it]                                                                                                                                     {'loss': '0.5281', 'grad_norm': '0.1885', 'learning_rate': '5.538e-06', 'ppl': '1.696', 'memory/max_active (GiB)': '75.9', 'memory/max_allocated (GiB)': '75.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.09', 'tokens/total': 2210885888, 'tokens/trainable': 817426176, 'epoch': '1.996'}
 67%|█████████████████████████████████████████████████████████▉                             | 1165/1751 [19:29:23<9:48:51, 60.29s/it] 67%|█████████████████████████████████████████████████████████▉                             | 1166/1751 [19:30:23<9:47:30, 60.26s/it]                                                                                                                                     {'loss': '0.5219', 'grad_norm': '0.1963', 'learning_rate': '5.521e-06', 'ppl': '1.685', 'memory/max_active (GiB)': '74.85', 'memory/max_allocated (GiB)': '74.85', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '41.6', 'tokens/total': 2212790784, 'tokens/trainable': 818130112, 'epoch': '1.997'}
 67%|█████████████████████████████████████████████████████████▉                             | 1166/1751 [19:30:23<9:47:30, 60.26s/it] 67%|█████████████████████████████████████████████████████████▉                             | 1167/1751 [19:31:23<9:46:34, 60.26s/it]                                                                                                                                     {'loss': '0.49', 'grad_norm': '0.1777', 'learning_rate': '5.504e-06', 'ppl': '1.632', 'memory/max_active (GiB)': '75.58', 'memory/max_allocated (GiB)': '75.58', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '61.21', 'tokens/total': 2214704128, 'tokens/trainable': 818804480, 'epoch': '1.999'}
 67%|█████████████████████████████████████████████████████████▉                             | 1167/1751 [19:31:23<9:46:34, 60.26s/it] 67%|██████████████████████████████████████████████████████████                             | 1168/1751 [19:32:00<8:36:54, 53.20s/it]                                                                                                                                     {'loss': '0.5257', 'grad_norm': '0.2188', 'learning_rate': '5.487e-06', 'ppl': '1.692', 'memory/max_active (GiB)': '72.25', 'memory/max_allocated (GiB)': '72.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '166', 'tokens/total': 2215850240, 'tokens/trainable': 819243136, 'epoch': '2'}
 67%|██████████████████████████████████████████████████████████                             | 1168/1751 [19:32:00<8:36:54, 53.20s/it][2026-02-04 22:55:27,557] [INFO] [axolotl.core.trainers.base._save:721] [PID:23602] Saving model checkpoint to ./outputs/checkpoint-1168

Writing model shards:   0%|                                                                                    | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.78s/it][AWriting model shards: 100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.78s/it]
 67%|█████████████████████████████████████████████████████████▍                            | 1169/1751 [19:33:33<10:32:37, 65.22s/it]                                                                                                                                     {'loss': '0.547', 'grad_norm': '0.1924', 'learning_rate': '5.47e-06', 'ppl': '1.728', 'memory/max_active (GiB)': '72.82', 'memory/max_allocated (GiB)': '72.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26.83', 'tokens/total': 2217654272, 'tokens/trainable': 819867904, 'epoch': '2.002'}
 67%|█████████████████████████████████████████████████████████▍                            | 1169/1751 [19:33:33<10:32:37, 65.22s/it] 67%|█████████████████████████████████████████████████████████▍                            | 1170/1751 [19:34:31<10:09:50, 62.98s/it]                                                                                                                                     {'loss': '0.5771', 'grad_norm': '0.1895', 'learning_rate': '5.453e-06', 'ppl': '1.781', 'memory/max_active (GiB)': '69.8', 'memory/max_allocated (GiB)': '69.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '29.97', 'tokens/total': 2219422720, 'tokens/trainable': 820529024, 'epoch': '2.003'}
 67%|█████████████████████████████████████████████████████████▍                            | 1170/1751 [19:34:31<10:09:50, 62.98s/it] 67%|██████████████████████████████████████████████████████████▏                            | 1171/1751 [19:35:31<9:59:16, 61.99s/it]                                                                                                                                     {'loss': '0.5195', 'grad_norm': '0.1777', 'learning_rate': '5.437e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '68.59', 'memory/max_allocated (GiB)': '68.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '51.61', 'tokens/total': 2221291008, 'tokens/trainable': 821223168, 'epoch': '2.005'}
 67%|██████████████████████████████████████████████████████████▏                            | 1171/1751 [19:35:31<9:59:16, 61.99s/it] 67%|██████████████████████████████████████████████████████████▏                            | 1172/1751 [19:36:31<9:54:21, 61.59s/it]                                                                                                                                     {'loss': '0.5024', 'grad_norm': '0.1709', 'learning_rate': '5.42e-06', 'ppl': '1.653', 'memory/max_active (GiB)': '69.26', 'memory/max_allocated (GiB)': '69.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '65.58', 'tokens/total': 2223200256, 'tokens/trainable': 821959616, 'epoch': '2.007'}
 67%|██████████████████████████████████████████████████████████▏                            | 1172/1751 [19:36:31<9:54:21, 61.59s/it] 67%|██████████████████████████████████████████████████████████▎                            | 1173/1751 [19:37:28<9:39:34, 60.16s/it]                                                                                                                                     {'loss': '0.5169', 'grad_norm': '0.1836', 'learning_rate': '5.403e-06', 'ppl': '1.677', 'memory/max_active (GiB)': '76.5', 'memory/max_allocated (GiB)': '76.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '43.27', 'tokens/total': 2224961792, 'tokens/trainable': 822608704, 'epoch': '2.009'}
 67%|██████████████████████████████████████████████████████████▎                            | 1173/1751 [19:37:28<9:39:34, 60.16s/it] 67%|██████████████████████████████████████████████████████████▎                            | 1174/1751 [19:38:28<9:37:52, 60.09s/it]                                                                                                                                     {'loss': '0.5173', 'grad_norm': '0.1875', 'learning_rate': '5.386e-06', 'ppl': '1.677', 'memory/max_active (GiB)': '75.74', 'memory/max_allocated (GiB)': '75.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '77.67', 'tokens/total': 2226857728, 'tokens/trainable': 823307072, 'epoch': '2.01'}
 67%|██████████████████████████████████████████████████████████▎                            | 1174/1751 [19:38:28<9:37:52, 60.09s/it] 67%|██████████████████████████████████████████████████████████▍                            | 1175/1751 [19:39:29<9:38:22, 60.25s/it]                                                                                                                                     {'loss': '0.5336', 'grad_norm': '0.1738', 'learning_rate': '5.369e-06', 'ppl': '1.705', 'memory/max_active (GiB)': '75.12', 'memory/max_allocated (GiB)': '75.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '53.69', 'tokens/total': 2228771584, 'tokens/trainable': 824066496, 'epoch': '2.012'}
 67%|██████████████████████████████████████████████████████████▍                            | 1175/1751 [19:39:29<9:38:22, 60.25s/it] 67%|██████████████████████████████████████████████████████████▍                            | 1176/1751 [19:40:28<9:35:43, 60.08s/it]                                                                                                                                     {'loss': '0.5217', 'grad_norm': '0.1836', 'learning_rate': '5.353e-06', 'ppl': '1.685', 'memory/max_active (GiB)': '75.33', 'memory/max_allocated (GiB)': '75.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.62', 'tokens/total': 2230659072, 'tokens/trainable': 824762176, 'epoch': '2.014'}
 67%|██████████████████████████████████████████████████████████▍                            | 1176/1751 [19:40:28<9:35:43, 60.08s/it] 67%|██████████████████████████████████████████████████████████▍                            | 1177/1751 [19:41:30<9:39:11, 60.54s/it]                                                                                                                                     {'loss': '0.4929', 'grad_norm': '0.1826', 'learning_rate': '5.336e-06', 'ppl': '1.637', 'memory/max_active (GiB)': '76.93', 'memory/max_allocated (GiB)': '76.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.67', 'tokens/total': 2232634368, 'tokens/trainable': 825486592, 'epoch': '2.015'}
 67%|██████████████████████████████████████████████████████████▍                            | 1177/1751 [19:41:30<9:39:11, 60.54s/it] 67%|██████████████████████████████████████████████████████████▌                            | 1178/1751 [19:42:31<9:40:38, 60.80s/it]                                                                                                                                     {'loss': '0.4874', 'grad_norm': '0.1758', 'learning_rate': '5.319e-06', 'ppl': '1.628', 'memory/max_active (GiB)': '74.26', 'memory/max_allocated (GiB)': '74.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '128.3', 'tokens/total': 2234508288, 'tokens/trainable': 826192512, 'epoch': '2.017'}
 67%|██████████████████████████████████████████████████████████▌                            | 1178/1751 [19:42:31<9:40:38, 60.80s/it] 67%|██████████████████████████████████████████████████████████▌                            | 1179/1751 [19:43:31<9:36:41, 60.49s/it]                                                                                                                                     {'loss': '0.5108', 'grad_norm': '0.1816', 'learning_rate': '5.303e-06', 'ppl': '1.667', 'memory/max_active (GiB)': '73.46', 'memory/max_allocated (GiB)': '73.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '176.9', 'tokens/total': 2236368384, 'tokens/trainable': 826867776, 'epoch': '2.019'}
 67%|██████████████████████████████████████████████████████████▌                            | 1179/1751 [19:43:31<9:36:41, 60.49s/it] 67%|██████████████████████████████████████████████████████████▋                            | 1180/1751 [19:44:31<9:34:38, 60.38s/it]                                                                                                                                     {'loss': '0.4903', 'grad_norm': '0.1738', 'learning_rate': '5.286e-06', 'ppl': '1.633', 'memory/max_active (GiB)': '71.75', 'memory/max_allocated (GiB)': '71.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '79.95', 'tokens/total': 2238269184, 'tokens/trainable': 827569344, 'epoch': '2.021'}
 67%|██████████████████████████████████████████████████████████▋                            | 1180/1751 [19:44:31<9:34:38, 60.38s/it] 67%|██████████████████████████████████████████████████████████▋                            | 1181/1751 [19:45:34<9:39:22, 60.99s/it]                                                                                                                                     {'loss': '0.4748', 'grad_norm': '0.1689', 'learning_rate': '5.269e-06', 'ppl': '1.608', 'memory/max_active (GiB)': '75.18', 'memory/max_allocated (GiB)': '75.18', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.2', 'tokens/total': 2240308736, 'tokens/trainable': 828300928, 'epoch': '2.022'}
 67%|██████████████████████████████████████████████████████████▋                            | 1181/1751 [19:45:34<9:39:22, 60.99s/it] 68%|██████████████████████████████████████████████████████████▋                            | 1182/1751 [19:46:34<9:37:30, 60.90s/it]                                                                                                                                     {'loss': '0.5477', 'grad_norm': '0.1836', 'learning_rate': '5.253e-06', 'ppl': '1.729', 'memory/max_active (GiB)': '75.92', 'memory/max_allocated (GiB)': '75.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '110.6', 'tokens/total': 2242236928, 'tokens/trainable': 829021760, 'epoch': '2.024'}
 68%|██████████████████████████████████████████████████████████▋                            | 1182/1751 [19:46:34<9:37:30, 60.90s/it] 68%|██████████████████████████████████████████████████████████▊                            | 1183/1751 [19:47:35<9:35:15, 60.77s/it]                                                                                                                                     {'loss': '0.4795', 'grad_norm': '0.1729', 'learning_rate': '5.236e-06', 'ppl': '1.615', 'memory/max_active (GiB)': '70.33', 'memory/max_allocated (GiB)': '70.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '33.04', 'tokens/total': 2244164096, 'tokens/trainable': 829742272, 'epoch': '2.026'}
 68%|██████████████████████████████████████████████████████████▊                            | 1183/1751 [19:47:35<9:35:15, 60.77s/it] 68%|██████████████████████████████████████████████████████████▊                            | 1184/1751 [19:48:36<9:36:21, 60.99s/it]                                                                                                                                     {'loss': '0.5247', 'grad_norm': '0.1777', 'learning_rate': '5.22e-06', 'ppl': '1.69', 'memory/max_active (GiB)': '75.82', 'memory/max_allocated (GiB)': '75.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.19', 'tokens/total': 2246127872, 'tokens/trainable': 830480256, 'epoch': '2.027'}
 68%|██████████████████████████████████████████████████████████▊                            | 1184/1751 [19:48:36<9:36:21, 60.99s/it] 68%|██████████████████████████████████████████████████████████▉                            | 1185/1751 [19:49:35<9:29:42, 60.39s/it]                                                                                                                                     {'loss': '0.5251', 'grad_norm': '0.1797', 'learning_rate': '5.203e-06', 'ppl': '1.691', 'memory/max_active (GiB)': '71.42', 'memory/max_allocated (GiB)': '71.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '39.69', 'tokens/total': 2247966976, 'tokens/trainable': 831160512, 'epoch': '2.029'}
 68%|██████████████████████████████████████████████████████████▉                            | 1185/1751 [19:49:35<9:29:42, 60.39s/it] 68%|██████████████████████████████████████████████████████████▉                            | 1186/1751 [19:50:36<9:29:49, 60.51s/it]                                                                                                                                     {'loss': '0.4975', 'grad_norm': '0.1641', 'learning_rate': '5.186e-06', 'ppl': '1.645', 'memory/max_active (GiB)': '75.63', 'memory/max_allocated (GiB)': '75.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '66.63', 'tokens/total': 2249891584, 'tokens/trainable': 831902336, 'epoch': '2.031'}
 68%|██████████████████████████████████████████████████████████▉                            | 1186/1751 [19:50:36<9:29:49, 60.51s/it] 68%|██████████████████████████████████████████████████████████▉                            | 1187/1751 [19:51:37<9:29:10, 60.55s/it]                                                                                                                                     {'loss': '0.5196', 'grad_norm': '0.1826', 'learning_rate': '5.17e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '78.23', 'memory/max_allocated (GiB)': '78.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '135', 'tokens/total': 2251747328, 'tokens/trainable': 832603136, 'epoch': '2.033'}
 68%|██████████████████████████████████████████████████████████▉                            | 1187/1751 [19:51:37<9:29:10, 60.55s/it] 68%|███████████████████████████████████████████████████████████                            | 1188/1751 [19:52:37<9:27:37, 60.49s/it]                                                                                                                                     {'loss': '0.5411', 'grad_norm': '0.1865', 'learning_rate': '5.153e-06', 'ppl': '1.718', 'memory/max_active (GiB)': '68.62', 'memory/max_allocated (GiB)': '68.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '53.93', 'tokens/total': 2253632256, 'tokens/trainable': 833336832, 'epoch': '2.034'}
 68%|███████████████████████████████████████████████████████████                            | 1188/1751 [19:52:37<9:27:37, 60.49s/it] 68%|███████████████████████████████████████████████████████████                            | 1189/1751 [19:53:37<9:26:11, 60.45s/it]                                                                                                                                     {'loss': '0.5064', 'grad_norm': '0.1719', 'learning_rate': '5.137e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '74.83', 'memory/max_allocated (GiB)': '74.83', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.08', 'tokens/total': 2255448320, 'tokens/trainable': 834047808, 'epoch': '2.036'}
 68%|███████████████████████████████████████████████████████████                            | 1189/1751 [19:53:37<9:26:11, 60.45s/it] 68%|███████████████████████████████████████████████████████████▏                           | 1190/1751 [19:54:36<9:20:44, 59.97s/it]                                                                                                                                     {'loss': '0.5298', 'grad_norm': '0.1787', 'learning_rate': '5.12e-06', 'ppl': '1.699', 'memory/max_active (GiB)': '71.2', 'memory/max_allocated (GiB)': '71.2', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '37.34', 'tokens/total': 2257291264, 'tokens/trainable': 834721600, 'epoch': '2.038'}
 68%|███████████████████████████████████████████████████████████▏                           | 1190/1751 [19:54:36<9:20:44, 59.97s/it] 68%|███████████████████████████████████████████████████████████▏                           | 1191/1751 [19:55:34<9:14:21, 59.40s/it]                                                                                                                                     {'loss': '0.553', 'grad_norm': '0.1904', 'learning_rate': '5.104e-06', 'ppl': '1.738', 'memory/max_active (GiB)': '68.64', 'memory/max_allocated (GiB)': '68.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.49', 'tokens/total': 2259105792, 'tokens/trainable': 835395520, 'epoch': '2.039'}
 68%|███████████████████████████████████████████████████████████▏                           | 1191/1751 [19:55:34<9:14:21, 59.40s/it] 68%|███████████████████████████████████████████████████████████▏                           | 1192/1751 [19:56:31<9:05:50, 58.59s/it]                                                                                                                                     {'loss': '0.5509', 'grad_norm': '0.1943', 'learning_rate': '5.087e-06', 'ppl': '1.735', 'memory/max_active (GiB)': '75.1', 'memory/max_allocated (GiB)': '75.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.96', 'tokens/total': 2260882176, 'tokens/trainable': 836050368, 'epoch': '2.041'}
 68%|███████████████████████████████████████████████████████████▏                           | 1192/1751 [19:56:31<9:05:50, 58.59s/it] 68%|███████████████████████████████████████████████████████████▎                           | 1193/1751 [19:57:30<9:05:15, 58.63s/it]                                                                                                                                     {'loss': '0.532', 'grad_norm': '0.1885', 'learning_rate': '5.071e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '72.79', 'memory/max_allocated (GiB)': '72.79', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.39', 'tokens/total': 2262729728, 'tokens/trainable': 836735360, 'epoch': '2.043'}
 68%|███████████████████████████████████████████████████████████▎                           | 1193/1751 [19:57:30<9:05:15, 58.63s/it] 68%|███████████████████████████████████████████████████████████▎                           | 1194/1751 [19:58:28<9:04:28, 58.65s/it]                                                                                                                                     {'loss': '0.5208', 'grad_norm': '0.1738', 'learning_rate': '5.055e-06', 'ppl': '1.683', 'memory/max_active (GiB)': '76.06', 'memory/max_allocated (GiB)': '76.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.6', 'tokens/total': 2264576256, 'tokens/trainable': 837422656, 'epoch': '2.045'}
 68%|███████████████████████████████████████████████████████████▎                           | 1194/1751 [19:58:28<9:04:28, 58.65s/it] 68%|███████████████████████████████████████████████████████████▎                           | 1195/1751 [19:59:27<9:02:10, 58.51s/it]                                                                                                                                     {'loss': '0.5644', 'grad_norm': '0.1934', 'learning_rate': '5.038e-06', 'ppl': '1.758', 'memory/max_active (GiB)': '75.94', 'memory/max_allocated (GiB)': '75.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '56.78', 'tokens/total': 2266450176, 'tokens/trainable': 838102848, 'epoch': '2.046'}
 68%|███████████████████████████████████████████████████████████▎                           | 1195/1751 [19:59:27<9:02:10, 58.51s/it] 68%|███████████████████████████████████████████████████████████▍                           | 1196/1751 [20:00:26<9:04:08, 58.83s/it]                                                                                                                                     {'loss': '0.4833', 'grad_norm': '0.1787', 'learning_rate': '5.022e-06', 'ppl': '1.621', 'memory/max_active (GiB)': '77.15', 'memory/max_allocated (GiB)': '77.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '152.2', 'tokens/total': 2268350976, 'tokens/trainable': 838788992, 'epoch': '2.048'}
 68%|███████████████████████████████████████████████████████████▍                           | 1196/1751 [20:00:26<9:04:08, 58.83s/it] 68%|███████████████████████████████████████████████████████████▍                           | 1197/1751 [20:01:26<9:06:02, 59.14s/it]                                                                                                                                     {'loss': '0.5125', 'grad_norm': '0.1826', 'learning_rate': '5.005e-06', 'ppl': '1.67', 'memory/max_active (GiB)': '72.19', 'memory/max_allocated (GiB)': '72.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '39.65', 'tokens/total': 2270255872, 'tokens/trainable': 839492544, 'epoch': '2.05'}
 68%|███████████████████████████████████████████████████████████▍                           | 1197/1751 [20:01:26<9:06:02, 59.14s/it] 68%|███████████████████████████████████████████████████████████▌                           | 1198/1751 [20:02:27<9:09:58, 59.67s/it]                                                                                                                                     {'loss': '0.5056', 'grad_norm': '0.1709', 'learning_rate': '4.989e-06', 'ppl': '1.658', 'memory/max_active (GiB)': '76.64', 'memory/max_allocated (GiB)': '76.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '113.1', 'tokens/total': 2272197632, 'tokens/trainable': 840213440, 'epoch': '2.051'}
 68%|███████████████████████████████████████████████████████████▌                           | 1198/1751 [20:02:27<9:09:58, 59.67s/it] 68%|███████████████████████████████████████████████████████████▌                           | 1199/1751 [20:03:29<9:14:14, 60.24s/it]                                                                                                                                     {'loss': '0.523', 'grad_norm': '0.1865', 'learning_rate': '4.973e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '73.43', 'memory/max_allocated (GiB)': '73.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '79.09', 'tokens/total': 2274115840, 'tokens/trainable': 840895168, 'epoch': '2.053'}
 68%|███████████████████████████████████████████████████████████▌                           | 1199/1751 [20:03:29<9:14:14, 60.24s/it] 69%|███████████████████████████████████████████████████████████▌                           | 1200/1751 [20:04:27<9:09:24, 59.83s/it]                                                                                                                                     {'loss': '0.5563', 'grad_norm': '0.1904', 'learning_rate': '4.956e-06', 'ppl': '1.744', 'memory/max_active (GiB)': '72.96', 'memory/max_allocated (GiB)': '72.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.09', 'tokens/total': 2275957248, 'tokens/trainable': 841557184, 'epoch': '2.055'}
 69%|███████████████████████████████████████████████████████████▌                           | 1200/1751 [20:04:27<9:09:24, 59.83s/it] 69%|███████████████████████████████████████████████████████████▋                           | 1201/1751 [20:05:27<9:07:46, 59.76s/it]                                                                                                                                     {'loss': '0.5279', 'grad_norm': '0.1797', 'learning_rate': '4.94e-06', 'ppl': '1.695', 'memory/max_active (GiB)': '73.71', 'memory/max_allocated (GiB)': '73.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '7.843', 'tokens/total': 2277815040, 'tokens/trainable': 842264768, 'epoch': '2.057'}
 69%|███████████████████████████████████████████████████████████▋                           | 1201/1751 [20:05:27<9:07:46, 59.76s/it] 69%|███████████████████████████████████████████████████████████▋                           | 1202/1751 [20:06:26<9:03:47, 59.43s/it]                                                                                                                                     {'loss': '0.5366', 'grad_norm': '0.1865', 'learning_rate': '4.924e-06', 'ppl': '1.71', 'memory/max_active (GiB)': '72.74', 'memory/max_allocated (GiB)': '72.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '94.03', 'tokens/total': 2279665152, 'tokens/trainable': 842936256, 'epoch': '2.058'}
 69%|███████████████████████████████████████████████████████████▋                           | 1202/1751 [20:06:26<9:03:47, 59.43s/it] 69%|███████████████████████████████████████████████████████████▊                           | 1203/1751 [20:07:26<9:04:38, 59.63s/it]                                                                                                                                     {'loss': '0.4785', 'grad_norm': '0.1816', 'learning_rate': '4.908e-06', 'ppl': '1.614', 'memory/max_active (GiB)': '72.68', 'memory/max_allocated (GiB)': '72.68', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '63.87', 'tokens/total': 2281580800, 'tokens/trainable': 843633024, 'epoch': '2.06'}
 69%|███████████████████████████████████████████████████████████▊                           | 1203/1751 [20:07:26<9:04:38, 59.63s/it] 69%|███████████████████████████████████████████████████████████▊                           | 1204/1751 [20:08:26<9:04:05, 59.68s/it]                                                                                                                                     {'loss': '0.5192', 'grad_norm': '0.1787', 'learning_rate': '4.891e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '73.88', 'memory/max_allocated (GiB)': '73.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.1', 'tokens/total': 2283477760, 'tokens/trainable': 844339264, 'epoch': '2.062'}
 69%|███████████████████████████████████████████████████████████▊                           | 1204/1751 [20:08:26<9:04:05, 59.68s/it] 69%|███████████████████████████████████████████████████████████▊                           | 1205/1751 [20:09:25<9:01:15, 59.48s/it]                                                                                                                                     {'loss': '0.5126', 'grad_norm': '0.1826', 'learning_rate': '4.875e-06', 'ppl': '1.67', 'memory/max_active (GiB)': '76.79', 'memory/max_allocated (GiB)': '76.79', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '126.8', 'tokens/total': 2285311488, 'tokens/trainable': 845046336, 'epoch': '2.063'}
 69%|███████████████████████████████████████████████████████████▊                           | 1205/1751 [20:09:25<9:01:15, 59.48s/it] 69%|███████████████████████████████████████████████████████████▉                           | 1206/1751 [20:10:24<8:59:29, 59.39s/it]                                                                                                                                     {'loss': '0.5213', 'grad_norm': '0.1738', 'learning_rate': '4.859e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '66.61', 'memory/max_allocated (GiB)': '66.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '190.7', 'tokens/total': 2287203328, 'tokens/trainable': 845742464, 'epoch': '2.065'}
 69%|███████████████████████████████████████████████████████████▉                           | 1206/1751 [20:10:24<8:59:29, 59.39s/it] 69%|███████████████████████████████████████████████████████████▉                           | 1207/1751 [20:11:23<8:57:17, 59.26s/it]                                                                                                                                     {'loss': '0.5143', 'grad_norm': '0.1875', 'learning_rate': '4.843e-06', 'ppl': '1.673', 'memory/max_active (GiB)': '74.49', 'memory/max_allocated (GiB)': '74.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '8.521', 'tokens/total': 2289064704, 'tokens/trainable': 846406272, 'epoch': '2.067'}
 69%|███████████████████████████████████████████████████████████▉                           | 1207/1751 [20:11:23<8:57:17, 59.26s/it] 69%|████████████████████████████████████████████████████████████                           | 1208/1751 [20:12:24<9:02:41, 59.97s/it]                                                                                                                                     {'loss': '0.5453', 'grad_norm': '0.1758', 'learning_rate': '4.827e-06', 'ppl': '1.725', 'memory/max_active (GiB)': '71.32', 'memory/max_allocated (GiB)': '71.32', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.27', 'tokens/total': 2291038976, 'tokens/trainable': 847137472, 'epoch': '2.069'}
 69%|████████████████████████████████████████████████████████████                           | 1208/1751 [20:12:24<9:02:41, 59.97s/it] 69%|████████████████████████████████████████████████████████████                           | 1209/1751 [20:13:24<9:01:54, 59.99s/it]                                                                                                                                     {'loss': '0.5307', 'grad_norm': '0.1816', 'learning_rate': '4.81e-06', 'ppl': '1.7', 'memory/max_active (GiB)': '74.59', 'memory/max_allocated (GiB)': '74.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '149.2', 'tokens/total': 2292964864, 'tokens/trainable': 847865152, 'epoch': '2.07'}
 69%|████████████████████████████████████████████████████████████                           | 1209/1751 [20:13:24<9:01:54, 59.99s/it] 69%|████████████████████████████████████████████████████████████                           | 1210/1751 [20:14:26<9:05:20, 60.48s/it]                                                                                                                                     {'loss': '0.5152', 'grad_norm': '0.1904', 'learning_rate': '4.794e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '74.72', 'memory/max_allocated (GiB)': '74.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '94.39', 'tokens/total': 2294873088, 'tokens/trainable': 848587904, 'epoch': '2.072'}
 69%|████████████████████████████████████████████████████████████                           | 1210/1751 [20:14:26<9:05:20, 60.48s/it] 69%|████████████████████████████████████████████████████████████▏                          | 1211/1751 [20:15:26<9:03:17, 60.37s/it]                                                                                                                                     {'loss': '0.5137', 'grad_norm': '0.1816', 'learning_rate': '4.778e-06', 'ppl': '1.671', 'memory/max_active (GiB)': '75.1', 'memory/max_allocated (GiB)': '75.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '117.9', 'tokens/total': 2296813056, 'tokens/trainable': 849304704, 'epoch': '2.074'}
 69%|████████████████████████████████████████████████████████████▏                          | 1211/1751 [20:15:26<9:03:17, 60.37s/it] 69%|████████████████████████████████████████████████████████████▏                          | 1212/1751 [20:16:25<8:57:38, 59.85s/it]                                                                                                                                     {'loss': '0.5318', 'grad_norm': '0.1865', 'learning_rate': '4.762e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '71.3', 'memory/max_allocated (GiB)': '71.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '15.1', 'tokens/total': 2298605056, 'tokens/trainable': 849978304, 'epoch': '2.075'}
 69%|████████████████████████████████████████████████████████████▏                          | 1212/1751 [20:16:25<8:57:38, 59.85s/it] 69%|████████████████████████████████████████████████████████████▎                          | 1213/1751 [20:17:27<9:02:38, 60.52s/it]                                                                                                                                     {'loss': '0.5202', 'grad_norm': '0.1777', 'learning_rate': '4.746e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '73.5', 'memory/max_allocated (GiB)': '73.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '145.6', 'tokens/total': 2300587264, 'tokens/trainable': 850713472, 'epoch': '2.077'}
 69%|████████████████████████████████████████████████████████████▎                          | 1213/1751 [20:17:27<9:02:38, 60.52s/it] 69%|████████████████████████████████████████████████████████████▎                          | 1214/1751 [20:18:29<9:05:11, 60.92s/it]                                                                                                                                     {'loss': '0.4831', 'grad_norm': '0.1611', 'learning_rate': '4.73e-06', 'ppl': '1.621', 'memory/max_active (GiB)': '75.15', 'memory/max_allocated (GiB)': '75.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '65.67', 'tokens/total': 2302623744, 'tokens/trainable': 851466816, 'epoch': '2.079'}
 69%|████████████████████████████████████████████████████████████▎                          | 1214/1751 [20:18:29<9:05:11, 60.92s/it] 69%|████████████████████████████████████████████████████████████▎                          | 1215/1751 [20:19:28<8:59:53, 60.43s/it]                                                                                                                                     {'loss': '0.5014', 'grad_norm': '0.1943', 'learning_rate': '4.714e-06', 'ppl': '1.651', 'memory/max_active (GiB)': '72.01', 'memory/max_allocated (GiB)': '72.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.38', 'tokens/total': 2304495616, 'tokens/trainable': 852124480, 'epoch': '2.081'}
 69%|████████████████████████████████████████████████████████████▎                          | 1215/1751 [20:19:28<8:59:53, 60.43s/it] 69%|████████████████████████████████████████████████████████████▍                          | 1216/1751 [20:20:29<9:01:40, 60.75s/it]                                                                                                                                     {'loss': '0.5265', 'grad_norm': '0.1777', 'learning_rate': '4.698e-06', 'ppl': '1.693', 'memory/max_active (GiB)': '71.81', 'memory/max_allocated (GiB)': '71.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '83.77', 'tokens/total': 2306433536, 'tokens/trainable': 852878272, 'epoch': '2.082'}
 69%|████████████████████████████████████████████████████████████▍                          | 1216/1751 [20:20:29<9:01:40, 60.75s/it] 70%|████████████████████████████████████████████████████████████▍                          | 1217/1751 [20:21:27<8:53:32, 59.95s/it]                                                                                                                                     {'loss': '0.5037', 'grad_norm': '0.1699', 'learning_rate': '4.682e-06', 'ppl': '1.655', 'memory/max_active (GiB)': '72.02', 'memory/max_allocated (GiB)': '72.02', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '79.03', 'tokens/total': 2308279552, 'tokens/trainable': 853552576, 'epoch': '2.084'}
 70%|████████████████████████████████████████████████████████████▍                          | 1217/1751 [20:21:28<8:53:32, 59.95s/it] 70%|████████████████████████████████████████████████████████████▌                          | 1218/1751 [20:22:25<8:44:48, 59.08s/it]                                                                                                                                     {'loss': '0.5559', 'grad_norm': '0.1895', 'learning_rate': '4.666e-06', 'ppl': '1.743', 'memory/max_active (GiB)': '72.64', 'memory/max_allocated (GiB)': '72.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '42.77', 'tokens/total': 2310082048, 'tokens/trainable': 854197504, 'epoch': '2.086'}
 70%|████████████████████████████████████████████████████████████▌                          | 1218/1751 [20:22:25<8:44:48, 59.08s/it] 70%|████████████████████████████████████████████████████████████▌                          | 1219/1751 [20:23:23<8:41:12, 58.78s/it]                                                                                                                                     {'loss': '0.5333', 'grad_norm': '0.1943', 'learning_rate': '4.65e-06', 'ppl': '1.705', 'memory/max_active (GiB)': '74.39', 'memory/max_allocated (GiB)': '74.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '151.5', 'tokens/total': 2311908352, 'tokens/trainable': 854868224, 'epoch': '2.087'}
 70%|████████████████████████████████████████████████████████████▌                          | 1219/1751 [20:23:23<8:41:12, 58.78s/it] 70%|████████████████████████████████████████████████████████████▌                          | 1220/1751 [20:24:23<8:43:43, 59.18s/it]                                                                                                                                     {'loss': '0.5073', 'grad_norm': '0.1777', 'learning_rate': '4.634e-06', 'ppl': '1.661', 'memory/max_active (GiB)': '76.28', 'memory/max_allocated (GiB)': '76.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '131.2', 'tokens/total': 2313797632, 'tokens/trainable': 855606976, 'epoch': '2.089'}
 70%|████████████████████████████████████████████████████████████▌                          | 1220/1751 [20:24:23<8:43:43, 59.18s/it] 70%|████████████████████████████████████████████████████████████▋                          | 1221/1751 [20:25:22<8:42:11, 59.12s/it]                                                                                                                                     {'loss': '0.5066', 'grad_norm': '0.1738', 'learning_rate': '4.618e-06', 'ppl': '1.66', 'memory/max_active (GiB)': '73.71', 'memory/max_allocated (GiB)': '73.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '72.48', 'tokens/total': 2315606784, 'tokens/trainable': 856290688, 'epoch': '2.091'}
 70%|████████████████████████████████████████████████████████████▋                          | 1221/1751 [20:25:22<8:42:11, 59.12s/it] 70%|████████████████████████████████████████████████████████████▋                          | 1222/1751 [20:26:22<8:43:46, 59.41s/it]                                                                                                                                     {'loss': '0.5265', 'grad_norm': '0.1875', 'learning_rate': '4.602e-06', 'ppl': '1.693', 'memory/max_active (GiB)': '73.13', 'memory/max_allocated (GiB)': '73.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.44', 'tokens/total': 2317485824, 'tokens/trainable': 856979648, 'epoch': '2.093'}
 70%|████████████████████████████████████████████████████████████▋                          | 1222/1751 [20:26:22<8:43:46, 59.41s/it] 70%|████████████████████████████████████████████████████████████▊                          | 1223/1751 [20:27:23<8:48:50, 60.10s/it]                                                                                                                                     {'loss': '0.4913', 'grad_norm': '0.1797', 'learning_rate': '4.586e-06', 'ppl': '1.634', 'memory/max_active (GiB)': '77.11', 'memory/max_allocated (GiB)': '77.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '94.77', 'tokens/total': 2319458048, 'tokens/trainable': 857698688, 'epoch': '2.094'}
 70%|████████████████████████████████████████████████████████████▊                          | 1223/1751 [20:27:24<8:48:50, 60.10s/it] 70%|████████████████████████████████████████████████████████████▊                          | 1224/1751 [20:28:24<8:47:56, 60.11s/it]                                                                                                                                     {'loss': '0.5252', 'grad_norm': '0.1797', 'learning_rate': '4.571e-06', 'ppl': '1.691', 'memory/max_active (GiB)': '74.7', 'memory/max_allocated (GiB)': '74.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.14', 'tokens/total': 2321370112, 'tokens/trainable': 858421312, 'epoch': '2.096'}
 70%|████████████████████████████████████████████████████████████▊                          | 1224/1751 [20:28:24<8:47:56, 60.11s/it] 70%|████████████████████████████████████████████████████████████▊                          | 1225/1751 [20:29:25<8:49:11, 60.37s/it]                                                                                                                                     {'loss': '0.4884', 'grad_norm': '0.1855', 'learning_rate': '4.555e-06', 'ppl': '1.63', 'memory/max_active (GiB)': '73.79', 'memory/max_allocated (GiB)': '73.79', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '175.8', 'tokens/total': 2323317760, 'tokens/trainable': 859115584, 'epoch': '2.098'}
 70%|████████████████████████████████████████████████████████████▊                          | 1225/1751 [20:29:25<8:49:11, 60.37s/it] 70%|████████████████████████████████████████████████████████████▉                          | 1226/1751 [20:30:27<8:54:17, 61.06s/it]                                                                                                                                     {'loss': '0.4686', 'grad_norm': '0.1738', 'learning_rate': '4.539e-06', 'ppl': '1.598', 'memory/max_active (GiB)': '74.69', 'memory/max_allocated (GiB)': '74.69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '115.3', 'tokens/total': 2325362432, 'tokens/trainable': 859854272, 'epoch': '2.099'}
 70%|████████████████████████████████████████████████████████████▉                          | 1226/1751 [20:30:27<8:54:17, 61.06s/it] 70%|████████████████████████████████████████████████████████████▉                          | 1227/1751 [20:31:28<8:51:10, 60.82s/it]                                                                                                                                     {'loss': '0.5212', 'grad_norm': '0.1797', 'learning_rate': '4.523e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '75.29', 'memory/max_allocated (GiB)': '75.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '55.74', 'tokens/total': 2327324160, 'tokens/trainable': 860575104, 'epoch': '2.101'}
 70%|████████████████████████████████████████████████████████████▉                          | 1227/1751 [20:31:28<8:51:10, 60.82s/it] 70%|█████████████████████████████████████████████████████████████                          | 1228/1751 [20:32:27<8:46:45, 60.43s/it]                                                                                                                                     {'loss': '0.5619', 'grad_norm': '0.2031', 'learning_rate': '4.507e-06', 'ppl': '1.754', 'memory/max_active (GiB)': '72.14', 'memory/max_allocated (GiB)': '72.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '61.4', 'tokens/total': 2329238016, 'tokens/trainable': 861284736, 'epoch': '2.103'}
 70%|█████████████████████████████████████████████████████████████                          | 1228/1751 [20:32:27<8:46:45, 60.43s/it] 70%|█████████████████████████████████████████████████████████████                          | 1229/1751 [20:33:28<8:46:18, 60.50s/it]                                                                                                                                     {'loss': '0.4754', 'grad_norm': '0.1787', 'learning_rate': '4.491e-06', 'ppl': '1.609', 'memory/max_active (GiB)': '75.66', 'memory/max_allocated (GiB)': '75.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.06', 'tokens/total': 2331172096, 'tokens/trainable': 862004096, 'epoch': '2.105'}
 70%|█████████████████████████████████████████████████████████████                          | 1229/1751 [20:33:28<8:46:18, 60.50s/it] 70%|█████████████████████████████████████████████████████████████                          | 1230/1751 [20:34:24<8:35:29, 59.37s/it]                                                                                                                                     {'loss': '0.5328', 'grad_norm': '0.1826', 'learning_rate': '4.476e-06', 'ppl': '1.704', 'memory/max_active (GiB)': '73.08', 'memory/max_allocated (GiB)': '73.08', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '85.65', 'tokens/total': 2332934912, 'tokens/trainable': 862661568, 'epoch': '2.106'}
 70%|█████████████████████████████████████████████████████████████                          | 1230/1751 [20:34:24<8:35:29, 59.37s/it] 70%|█████████████████████████████████████████████████████████████▏                         | 1231/1751 [20:35:23<8:32:02, 59.08s/it]                                                                                                                                     {'loss': '0.5369', 'grad_norm': '0.1846', 'learning_rate': '4.46e-06', 'ppl': '1.711', 'memory/max_active (GiB)': '72.66', 'memory/max_allocated (GiB)': '72.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '145', 'tokens/total': 2334835456, 'tokens/trainable': 863374976, 'epoch': '2.108'}
 70%|█████████████████████████████████████████████████████████████▏                         | 1231/1751 [20:35:23<8:32:02, 59.08s/it] 70%|█████████████████████████████████████████████████████████████▏                         | 1232/1751 [20:36:24<8:37:16, 59.80s/it]                                                                                                                                     {'loss': '0.5021', 'grad_norm': '0.1719', 'learning_rate': '4.444e-06', 'ppl': '1.652', 'memory/max_active (GiB)': '76.83', 'memory/max_allocated (GiB)': '76.83', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '101.2', 'tokens/total': 2336830720, 'tokens/trainable': 864092672, 'epoch': '2.11'}
 70%|█████████████████████████████████████████████████████████████▏                         | 1232/1751 [20:36:24<8:37:16, 59.80s/it] 70%|█████████████████████████████████████████████████████████████▎                         | 1233/1751 [20:37:24<8:35:24, 59.70s/it]                                                                                                                                     {'loss': '0.5287', 'grad_norm': '0.1846', 'learning_rate': '4.429e-06', 'ppl': '1.697', 'memory/max_active (GiB)': '69.87', 'memory/max_allocated (GiB)': '69.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '116.6', 'tokens/total': 2338714624, 'tokens/trainable': 864760576, 'epoch': '2.111'}
 70%|█████████████████████████████████████████████████████████████▎                         | 1233/1751 [20:37:24<8:35:24, 59.70s/it] 70%|█████████████████████████████████████████████████████████████▎                         | 1234/1751 [20:38:23<8:33:02, 59.54s/it]                                                                                                                                     {'loss': '0.49', 'grad_norm': '0.1709', 'learning_rate': '4.413e-06', 'ppl': '1.632', 'memory/max_active (GiB)': '73.47', 'memory/max_allocated (GiB)': '73.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '97.63', 'tokens/total': 2340570624, 'tokens/trainable': 865446784, 'epoch': '2.113'}
 70%|█████████████████████████████████████████████████████████████▎                         | 1234/1751 [20:38:23<8:33:02, 59.54s/it] 71%|█████████████████████████████████████████████████████████████▎                         | 1235/1751 [20:39:24<8:35:44, 59.97s/it]                                                                                                                                     {'loss': '0.5062', 'grad_norm': '0.1777', 'learning_rate': '4.397e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '75.24', 'memory/max_allocated (GiB)': '75.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '105.3', 'tokens/total': 2342553600, 'tokens/trainable': 866189184, 'epoch': '2.115'}
 71%|█████████████████████████████████████████████████████████████▎                         | 1235/1751 [20:39:24<8:35:44, 59.97s/it] 71%|█████████████████████████████████████████████████████████████▍                         | 1236/1751 [20:40:21<8:28:03, 59.19s/it]                                                                                                                                     {'loss': '0.543', 'grad_norm': '0.1904', 'learning_rate': '4.382e-06', 'ppl': '1.721', 'memory/max_active (GiB)': '64.92', 'memory/max_allocated (GiB)': '64.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '149.4', 'tokens/total': 2344402176, 'tokens/trainable': 866878080, 'epoch': '2.117'}
 71%|█████████████████████████████████████████████████████████████▍                         | 1236/1751 [20:40:21<8:28:03, 59.19s/it] 71%|█████████████████████████████████████████████████████████████▍                         | 1237/1751 [20:41:24<8:36:35, 60.30s/it]                                                                                                                                     {'loss': '0.4922', 'grad_norm': '0.1729', 'learning_rate': '4.366e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '75.57', 'memory/max_allocated (GiB)': '75.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '66.15', 'tokens/total': 2346465536, 'tokens/trainable': 867627520, 'epoch': '2.118'}
 71%|█████████████████████████████████████████████████████████████▍                         | 1237/1751 [20:41:24<8:36:35, 60.30s/it] 71%|█████████████████████████████████████████████████████████████▌                         | 1238/1751 [20:42:22<8:29:40, 59.61s/it]                                                                                                                                     {'loss': '0.5172', 'grad_norm': '0.1729', 'learning_rate': '4.35e-06', 'ppl': '1.677', 'memory/max_active (GiB)': '70.44', 'memory/max_allocated (GiB)': '70.44', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.53', 'tokens/total': 2348309504, 'tokens/trainable': 868303744, 'epoch': '2.12'}
 71%|█████████████████████████████████████████████████████████████▌                         | 1238/1751 [20:42:22<8:29:40, 59.61s/it] 71%|█████████████████████████████████████████████████████████████▌                         | 1239/1751 [20:43:22<8:28:57, 59.64s/it]                                                                                                                                     {'loss': '0.5293', 'grad_norm': '0.1826', 'learning_rate': '4.335e-06', 'ppl': '1.698', 'memory/max_active (GiB)': '72.2', 'memory/max_allocated (GiB)': '72.2', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104', 'tokens/total': 2350214656, 'tokens/trainable': 869005568, 'epoch': '2.122'}
 71%|█████████████████████████████████████████████████████████████▌                         | 1239/1751 [20:43:22<8:28:57, 59.64s/it] 71%|█████████████████████████████████████████████████████████████▌                         | 1240/1751 [20:44:23<8:30:32, 59.95s/it]                                                                                                                                     {'loss': '0.5186', 'grad_norm': '0.1807', 'learning_rate': '4.319e-06', 'ppl': '1.68', 'memory/max_active (GiB)': '69.27', 'memory/max_allocated (GiB)': '69.27', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '112.8', 'tokens/total': 2352162560, 'tokens/trainable': 869729728, 'epoch': '2.123'}
 71%|█████████████████████████████████████████████████████████████▌                         | 1240/1751 [20:44:23<8:30:32, 59.95s/it] 71%|█████████████████████████████████████████████████████████████▋                         | 1241/1751 [20:45:21<8:25:19, 59.45s/it]                                                                                                                                     {'loss': '0.5193', 'grad_norm': '0.1719', 'learning_rate': '4.304e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '71.59', 'memory/max_allocated (GiB)': '71.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.2', 'tokens/total': 2354060800, 'tokens/trainable': 870436608, 'epoch': '2.125'}
 71%|█████████████████████████████████████████████████████████████▋                         | 1241/1751 [20:45:21<8:25:19, 59.45s/it] 71%|█████████████████████████████████████████████████████████████▋                         | 1242/1751 [20:46:18<8:18:06, 58.72s/it]                                                                                                                                     {'loss': '0.5209', 'grad_norm': '0.1895', 'learning_rate': '4.288e-06', 'ppl': '1.683', 'memory/max_active (GiB)': '73.7', 'memory/max_allocated (GiB)': '73.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.41', 'tokens/total': 2355852800, 'tokens/trainable': 871106432, 'epoch': '2.127'}
 71%|█████████████████████████████████████████████████████████████▋                         | 1242/1751 [20:46:18<8:18:06, 58.72s/it] 71%|█████████████████████████████████████████████████████████████▊                         | 1243/1751 [20:47:19<8:24:13, 59.55s/it]                                                                                                                                     {'loss': '0.5213', 'grad_norm': '0.1758', 'learning_rate': '4.273e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '73.96', 'memory/max_allocated (GiB)': '73.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '34.26', 'tokens/total': 2357807104, 'tokens/trainable': 871820992, 'epoch': '2.129'}
 71%|█████████████████████████████████████████████████████████████▊                         | 1243/1751 [20:47:19<8:24:13, 59.55s/it] 71%|█████████████████████████████████████████████████████████████▊                         | 1244/1751 [20:48:18<8:20:44, 59.26s/it]                                                                                                                                     {'loss': '0.542', 'grad_norm': '0.1895', 'learning_rate': '4.257e-06', 'ppl': '1.719', 'memory/max_active (GiB)': '72.87', 'memory/max_allocated (GiB)': '72.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.23', 'tokens/total': 2359672064, 'tokens/trainable': 872491264, 'epoch': '2.13'}
 71%|█████████████████████████████████████████████████████████████▊                         | 1244/1751 [20:48:18<8:20:44, 59.26s/it] 71%|█████████████████████████████████████████████████████████████▊                         | 1245/1751 [20:49:19<8:24:53, 59.87s/it]                                                                                                                                     {'loss': '0.5085', 'grad_norm': '0.1748', 'learning_rate': '4.242e-06', 'ppl': '1.663', 'memory/max_active (GiB)': '75.94', 'memory/max_allocated (GiB)': '75.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '102', 'tokens/total': 2361615872, 'tokens/trainable': 873236992, 'epoch': '2.132'}
 71%|█████████████████████████████████████████████████████████████▊                         | 1245/1751 [20:49:19<8:24:53, 59.87s/it] 71%|█████████████████████████████████████████████████████████████▉                         | 1246/1751 [20:50:20<8:25:32, 60.06s/it]                                                                                                                                     {'loss': '0.5058', 'grad_norm': '0.1748', 'learning_rate': '4.226e-06', 'ppl': '1.658', 'memory/max_active (GiB)': '75.83', 'memory/max_allocated (GiB)': '75.83', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '61.98', 'tokens/total': 2363557376, 'tokens/trainable': 873983168, 'epoch': '2.134'}
 71%|█████████████████████████████████████████████████████████████▉                         | 1246/1751 [20:50:20<8:25:32, 60.06s/it] 71%|█████████████████████████████████████████████████████████████▉                         | 1247/1751 [20:51:18<8:21:00, 59.64s/it]                                                                                                                                     {'loss': '0.543', 'grad_norm': '0.1943', 'learning_rate': '4.211e-06', 'ppl': '1.721', 'memory/max_active (GiB)': '71.7', 'memory/max_allocated (GiB)': '71.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '138.2', 'tokens/total': 2365392128, 'tokens/trainable': 874658112, 'epoch': '2.135'}
 71%|█████████████████████████████████████████████████████████████▉                         | 1247/1751 [20:51:18<8:21:00, 59.64s/it] 71%|██████████████████████████████████████████████████████████████                         | 1248/1751 [20:52:14<8:09:34, 58.40s/it]                                                                                                                                     {'loss': '0.5244', 'grad_norm': '0.2148', 'learning_rate': '4.196e-06', 'ppl': '1.689', 'memory/max_active (GiB)': '77.01', 'memory/max_allocated (GiB)': '77.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.6', 'tokens/total': 2367116032, 'tokens/trainable': 875280640, 'epoch': '2.137'}
 71%|██████████████████████████████████████████████████████████████                         | 1248/1751 [20:52:14<8:09:34, 58.40s/it] 71%|██████████████████████████████████████████████████████████████                         | 1249/1751 [20:53:12<8:08:55, 58.44s/it]                                                                                                                                     {'loss': '0.5593', 'grad_norm': '0.1787', 'learning_rate': '4.18e-06', 'ppl': '1.75', 'memory/max_active (GiB)': '70.64', 'memory/max_allocated (GiB)': '70.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '30.35', 'tokens/total': 2368974336, 'tokens/trainable': 875972672, 'epoch': '2.139'}
 71%|██████████████████████████████████████████████████████████████                         | 1249/1751 [20:53:12<8:08:55, 58.44s/it] 71%|██████████████████████████████████████████████████████████████                         | 1250/1751 [20:54:14<8:15:10, 59.30s/it]                                                                                                                                     {'loss': '0.5029', 'grad_norm': '0.1719', 'learning_rate': '4.165e-06', 'ppl': '1.654', 'memory/max_active (GiB)': '72.6', 'memory/max_allocated (GiB)': '72.6', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.1', 'tokens/total': 2370962944, 'tokens/trainable': 876722816, 'epoch': '2.141'}
 71%|██████████████████████████████████████████████████████████████                         | 1250/1751 [20:54:14<8:15:10, 59.30s/it] 71%|██████████████████████████████████████████████████████████████▏                        | 1251/1751 [20:55:14<8:17:19, 59.68s/it]                                                                                                                                     {'loss': '0.5071', 'grad_norm': '0.1758', 'learning_rate': '4.15e-06', 'ppl': '1.66', 'memory/max_active (GiB)': '75.94', 'memory/max_allocated (GiB)': '75.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.61', 'tokens/total': 2372868864, 'tokens/trainable': 877444032, 'epoch': '2.142'}
 71%|██████████████████████████████████████████████████████████████▏                        | 1251/1751 [20:55:14<8:17:19, 59.68s/it] 72%|██████████████████████████████████████████████████████████████▏                        | 1252/1751 [20:56:12<8:12:17, 59.19s/it]                                                                                                                                     {'loss': '0.5131', 'grad_norm': '0.1797', 'learning_rate': '4.134e-06', 'ppl': '1.67', 'memory/max_active (GiB)': '71.23', 'memory/max_allocated (GiB)': '71.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '34.61', 'tokens/total': 2374694400, 'tokens/trainable': 878123776, 'epoch': '2.144'}
 72%|██████████████████████████████████████████████████████████████▏                        | 1252/1751 [20:56:12<8:12:17, 59.19s/it] 72%|██████████████████████████████████████████████████████████████▎                        | 1253/1751 [20:57:12<8:12:04, 59.29s/it]                                                                                                                                     {'loss': '0.5062', 'grad_norm': '0.166', 'learning_rate': '4.119e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '77.21', 'memory/max_allocated (GiB)': '77.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '96.91', 'tokens/total': 2376615936, 'tokens/trainable': 878857856, 'epoch': '2.146'}
 72%|██████████████████████████████████████████████████████████████▎                        | 1253/1751 [20:57:12<8:12:04, 59.29s/it] 72%|██████████████████████████████████████████████████████████████▎                        | 1254/1751 [20:58:09<8:05:35, 58.62s/it]                                                                                                                                     {'loss': '0.5036', 'grad_norm': '0.1836', 'learning_rate': '4.104e-06', 'ppl': '1.655', 'memory/max_active (GiB)': '68.17', 'memory/max_allocated (GiB)': '68.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '44.08', 'tokens/total': 2378401280, 'tokens/trainable': 879506880, 'epoch': '2.147'}
 72%|██████████████████████████████████████████████████████████████▎                        | 1254/1751 [20:58:09<8:05:35, 58.62s/it] 72%|██████████████████████████████████████████████████████████████▎                        | 1255/1751 [20:59:07<8:03:02, 58.43s/it]                                                                                                                                     {'loss': '0.5163', 'grad_norm': '0.1934', 'learning_rate': '4.089e-06', 'ppl': '1.676', 'memory/max_active (GiB)': '75.24', 'memory/max_allocated (GiB)': '75.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.71', 'tokens/total': 2380219136, 'tokens/trainable': 880168256, 'epoch': '2.149'}
 72%|██████████████████████████████████████████████████████████████▎                        | 1255/1751 [20:59:07<8:03:02, 58.43s/it] 72%|██████████████████████████████████████████████████████████████▍                        | 1256/1751 [21:00:06<8:04:01, 58.67s/it]                                                                                                                                     {'loss': '0.5364', 'grad_norm': '0.1855', 'learning_rate': '4.073e-06', 'ppl': '1.71', 'memory/max_active (GiB)': '75.21', 'memory/max_allocated (GiB)': '75.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '15.63', 'tokens/total': 2382036224, 'tokens/trainable': 880826112, 'epoch': '2.151'}
 72%|██████████████████████████████████████████████████████████████▍                        | 1256/1751 [21:00:06<8:04:01, 58.67s/it] 72%|██████████████████████████████████████████████████████████████▍                        | 1257/1751 [21:01:05<8:03:45, 58.76s/it]                                                                                                                                     {'loss': '0.4912', 'grad_norm': '0.1816', 'learning_rate': '4.058e-06', 'ppl': '1.634', 'memory/max_active (GiB)': '70.36', 'memory/max_allocated (GiB)': '70.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.82', 'tokens/total': 2383865088, 'tokens/trainable': 881475008, 'epoch': '2.153'}
 72%|██████████████████████████████████████████████████████████████▍                        | 1257/1751 [21:01:05<8:03:45, 58.76s/it] 72%|██████████████████████████████████████████████████████████████▌                        | 1258/1751 [21:02:04<8:04:06, 58.92s/it]                                                                                                                                     {'loss': '0.5056', 'grad_norm': '0.1855', 'learning_rate': '4.043e-06', 'ppl': '1.658', 'memory/max_active (GiB)': '70.07', 'memory/max_allocated (GiB)': '70.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '38.19', 'tokens/total': 2385719040, 'tokens/trainable': 882163648, 'epoch': '2.154'}
 72%|██████████████████████████████████████████████████████████████▌                        | 1258/1751 [21:02:04<8:04:06, 58.92s/it] 72%|██████████████████████████████████████████████████████████████▌                        | 1259/1751 [21:03:03<8:02:13, 58.81s/it]                                                                                                                                     {'loss': '0.5319', 'grad_norm': '0.1865', 'learning_rate': '4.028e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '73.25', 'memory/max_allocated (GiB)': '73.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '106.5', 'tokens/total': 2387595008, 'tokens/trainable': 882846720, 'epoch': '2.156'}
 72%|██████████████████████████████████████████████████████████████▌                        | 1259/1751 [21:03:03<8:02:13, 58.81s/it] 72%|██████████████████████████████████████████████████████████████▌                        | 1260/1751 [21:04:00<7:57:11, 58.31s/it]                                                                                                                                     {'loss': '0.5316', 'grad_norm': '0.1777', 'learning_rate': '4.013e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '66.34', 'memory/max_allocated (GiB)': '66.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.63', 'tokens/total': 2389405952, 'tokens/trainable': 883544000, 'epoch': '2.158'}
 72%|██████████████████████████████████████████████████████████████▌                        | 1260/1751 [21:04:00<7:57:11, 58.31s/it] 72%|██████████████████████████████████████████████████████████████▋                        | 1261/1751 [21:05:00<8:01:11, 58.92s/it]                                                                                                                                     {'loss': '0.5057', 'grad_norm': '0.1807', 'learning_rate': '3.998e-06', 'ppl': '1.658', 'memory/max_active (GiB)': '73.4', 'memory/max_allocated (GiB)': '73.4', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '121.3', 'tokens/total': 2391341568, 'tokens/trainable': 884252160, 'epoch': '2.159'}
 72%|██████████████████████████████████████████████████████████████▋                        | 1261/1751 [21:05:00<8:01:11, 58.92s/it] 72%|██████████████████████████████████████████████████████████████▋                        | 1262/1751 [21:06:00<8:00:36, 58.97s/it]                                                                                                                                     {'loss': '0.4896', 'grad_norm': '0.1729', 'learning_rate': '3.983e-06', 'ppl': '1.632', 'memory/max_active (GiB)': '78.14', 'memory/max_allocated (GiB)': '78.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '146.8', 'tokens/total': 2393245696, 'tokens/trainable': 884948160, 'epoch': '2.161'}
 72%|██████████████████████████████████████████████████████████████▋                        | 1262/1751 [21:06:00<8:00:36, 58.97s/it] 72%|██████████████████████████████████████████████████████████████▊                        | 1263/1751 [21:07:00<8:02:33, 59.33s/it]                                                                                                                                     {'loss': '0.5201', 'grad_norm': '0.1816', 'learning_rate': '3.967e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '75.96', 'memory/max_allocated (GiB)': '75.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '56.45', 'tokens/total': 2395192320, 'tokens/trainable': 885680192, 'epoch': '2.163'}
 72%|██████████████████████████████████████████████████████████████▊                        | 1263/1751 [21:07:00<8:02:33, 59.33s/it] 72%|██████████████████████████████████████████████████████████████▊                        | 1264/1751 [21:07:59<8:02:36, 59.46s/it]                                                                                                                                     {'loss': '0.5075', 'grad_norm': '0.1768', 'learning_rate': '3.952e-06', 'ppl': '1.661', 'memory/max_active (GiB)': '72.57', 'memory/max_allocated (GiB)': '72.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '134.8', 'tokens/total': 2397090816, 'tokens/trainable': 886393792, 'epoch': '2.165'}
 72%|██████████████████████████████████████████████████████████████▊                        | 1264/1751 [21:08:00<8:02:36, 59.46s/it] 72%|██████████████████████████████████████████████████████████████▊                        | 1265/1751 [21:08:58<8:00:17, 59.30s/it]                                                                                                                                     {'loss': '0.5266', 'grad_norm': '0.4434', 'learning_rate': '3.937e-06', 'ppl': '1.693', 'memory/max_active (GiB)': '72.69', 'memory/max_allocated (GiB)': '72.69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '32.69', 'tokens/total': 2398966016, 'tokens/trainable': 887083008, 'epoch': '2.166'}
 72%|██████████████████████████████████████████████████████████████▊                        | 1265/1751 [21:08:58<8:00:17, 59.30s/it] 72%|██████████████████████████████████████████████████████████████▉                        | 1266/1751 [21:09:59<8:03:32, 59.82s/it]                                                                                                                                     {'loss': '0.4886', 'grad_norm': '0.1758', 'learning_rate': '3.922e-06', 'ppl': '1.63', 'memory/max_active (GiB)': '77.7', 'memory/max_allocated (GiB)': '77.7', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '58.98', 'tokens/total': 2400918528, 'tokens/trainable': 887821952, 'epoch': '2.168'}
 72%|██████████████████████████████████████████████████████████████▉                        | 1266/1751 [21:09:59<8:03:32, 59.82s/it] 72%|██████████████████████████████████████████████████████████████▉                        | 1267/1751 [21:10:57<7:57:38, 59.21s/it]                                                                                                                                     {'loss': '0.5609', 'grad_norm': '0.1924', 'learning_rate': '3.907e-06', 'ppl': '1.752', 'memory/max_active (GiB)': '76.58', 'memory/max_allocated (GiB)': '76.58', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '76.08', 'tokens/total': 2402714880, 'tokens/trainable': 888454528, 'epoch': '2.17'}
 72%|██████████████████████████████████████████████████████████████▉                        | 1267/1751 [21:10:57<7:57:38, 59.21s/it] 72%|███████████████████████████████████████████████████████████████                        | 1268/1751 [21:11:58<8:00:56, 59.75s/it]                                                                                                                                     {'loss': '0.4976', 'grad_norm': '0.1816', 'learning_rate': '3.892e-06', 'ppl': '1.645', 'memory/max_active (GiB)': '72.81', 'memory/max_allocated (GiB)': '72.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '133.5', 'tokens/total': 2404649216, 'tokens/trainable': 889135552, 'epoch': '2.171'}
 72%|███████████████████████████████████████████████████████████████                        | 1268/1751 [21:11:58<8:00:56, 59.75s/it] 72%|███████████████████████████████████████████████████████████████                        | 1269/1751 [21:12:58<7:59:19, 59.67s/it]                                                                                                                                     {'loss': '0.5078', 'grad_norm': '0.1787', 'learning_rate': '3.877e-06', 'ppl': '1.662', 'memory/max_active (GiB)': '69.8', 'memory/max_allocated (GiB)': '69.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '122.9', 'tokens/total': 2406568704, 'tokens/trainable': 889840192, 'epoch': '2.173'}
 72%|███████████████████████████████████████████████████████████████                        | 1269/1751 [21:12:58<7:59:19, 59.67s/it] 73%|███████████████████████████████████████████████████████████████                        | 1270/1751 [21:13:57<7:56:34, 59.45s/it]                                                                                                                                     {'loss': '0.5212', 'grad_norm': '0.1807', 'learning_rate': '3.863e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '77.29', 'memory/max_allocated (GiB)': '77.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '149.9', 'tokens/total': 2408484608, 'tokens/trainable': 890562112, 'epoch': '2.175'}
 73%|███████████████████████████████████████████████████████████████                        | 1270/1751 [21:13:57<7:56:34, 59.45s/it] 73%|███████████████████████████████████████████████████████████████▏                       | 1271/1751 [21:14:56<7:54:36, 59.33s/it]                                                                                                                                     {'loss': '0.5227', 'grad_norm': '0.1846', 'learning_rate': '3.848e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '72.03', 'memory/max_allocated (GiB)': '72.03', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '39.79', 'tokens/total': 2410379264, 'tokens/trainable': 891267264, 'epoch': '2.176'}
 73%|███████████████████████████████████████████████████████████████▏                       | 1271/1751 [21:14:56<7:54:36, 59.33s/it] 73%|███████████████████████████████████████████████████████████████▏                       | 1272/1751 [21:15:53<7:48:29, 58.68s/it]                                                                                                                                     {'loss': '0.5355', 'grad_norm': '0.1797', 'learning_rate': '3.833e-06', 'ppl': '1.708', 'memory/max_active (GiB)': '73.56', 'memory/max_allocated (GiB)': '73.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '173.7', 'tokens/total': 2412163584, 'tokens/trainable': 891949376, 'epoch': '2.178'}
 73%|███████████████████████████████████████████████████████████████▏                       | 1272/1751 [21:15:53<7:48:29, 58.68s/it] 73%|███████████████████████████████████████████████████████████████▎                       | 1273/1751 [21:16:53<7:50:54, 59.11s/it]                                                                                                                                     {'loss': '0.5238', 'grad_norm': '0.1875', 'learning_rate': '3.818e-06', 'ppl': '1.689', 'memory/max_active (GiB)': '74.76', 'memory/max_allocated (GiB)': '74.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '65.3', 'tokens/total': 2414113792, 'tokens/trainable': 892651072, 'epoch': '2.18'}
 73%|███████████████████████████████████████████████████████████████▎                       | 1273/1751 [21:16:53<7:50:54, 59.11s/it] 73%|███████████████████████████████████████████████████████████████▎                       | 1274/1751 [21:17:55<7:56:26, 59.93s/it]                                                                                                                                     {'loss': '0.5108', 'grad_norm': '0.1777', 'learning_rate': '3.803e-06', 'ppl': '1.667', 'memory/max_active (GiB)': '74.63', 'memory/max_allocated (GiB)': '74.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '205.2', 'tokens/total': 2416162304, 'tokens/trainable': 893381696, 'epoch': '2.182'}
 73%|███████████████████████████████████████████████████████████████▎                       | 1274/1751 [21:17:55<7:56:26, 59.93s/it] 73%|███████████████████████████████████████████████████████████████▎                       | 1275/1751 [21:18:54<7:53:11, 59.65s/it]                                                                                                                                     {'loss': '0.5046', 'grad_norm': '0.1768', 'learning_rate': '3.788e-06', 'ppl': '1.656', 'memory/max_active (GiB)': '74.85', 'memory/max_allocated (GiB)': '74.85', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '223.3', 'tokens/total': 2418051840, 'tokens/trainable': 894083456, 'epoch': '2.183'}
 73%|███████████████████████████████████████████████████████████████▎                       | 1275/1751 [21:18:54<7:53:11, 59.65s/it] 73%|███████████████████████████████████████████████████████████████▍                       | 1276/1751 [21:19:51<7:45:56, 58.86s/it]                                                                                                                                     {'loss': '0.533', 'grad_norm': '0.1836', 'learning_rate': '3.774e-06', 'ppl': '1.704', 'memory/max_active (GiB)': '77.77', 'memory/max_allocated (GiB)': '77.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '77.02', 'tokens/total': 2419876352, 'tokens/trainable': 894772160, 'epoch': '2.185'}
 73%|███████████████████████████████████████████████████████████████▍                       | 1276/1751 [21:19:51<7:45:56, 58.86s/it] 73%|███████████████████████████████████████████████████████████████▍                       | 1277/1751 [21:20:52<7:51:15, 59.65s/it]                                                                                                                                     {'loss': '0.526', 'grad_norm': '0.1641', 'learning_rate': '3.759e-06', 'ppl': '1.692', 'memory/max_active (GiB)': '72.29', 'memory/max_allocated (GiB)': '72.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '106.5', 'tokens/total': 2421827328, 'tokens/trainable': 895510784, 'epoch': '2.187'}
 73%|███████████████████████████████████████████████████████████████▍                       | 1277/1751 [21:20:52<7:51:15, 59.65s/it] 73%|███████████████████████████████████████████████████████████████▍                       | 1278/1751 [21:21:53<7:52:10, 59.90s/it]                                                                                                                                     {'loss': '0.496', 'grad_norm': '0.1729', 'learning_rate': '3.744e-06', 'ppl': '1.642', 'memory/max_active (GiB)': '72.1', 'memory/max_allocated (GiB)': '72.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.48', 'tokens/total': 2423754240, 'tokens/trainable': 896213632, 'epoch': '2.188'}
 73%|███████████████████████████████████████████████████████████████▍                       | 1278/1751 [21:21:53<7:52:10, 59.90s/it] 73%|███████████████████████████████████████████████████████████████▌                       | 1279/1751 [21:22:55<7:55:58, 60.51s/it]                                                                                                                                     {'loss': '0.4742', 'grad_norm': '0.1738', 'learning_rate': '3.729e-06', 'ppl': '1.607', 'memory/max_active (GiB)': '74.38', 'memory/max_allocated (GiB)': '74.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.26', 'tokens/total': 2425763072, 'tokens/trainable': 896973824, 'epoch': '2.19'}
 73%|███████████████████████████████████████████████████████████████▌                       | 1279/1751 [21:22:55<7:55:58, 60.51s/it] 73%|███████████████████████████████████████████████████████████████▌                       | 1280/1751 [21:23:55<7:54:42, 60.47s/it]                                                                                                                                     {'loss': '0.523', 'grad_norm': '0.1865', 'learning_rate': '3.715e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '76.04', 'memory/max_allocated (GiB)': '76.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '38.69', 'tokens/total': 2427690496, 'tokens/trainable': 897663232, 'epoch': '2.192'}
 73%|███████████████████████████████████████████████████████████████▌                       | 1280/1751 [21:23:55<7:54:42, 60.47s/it] 73%|███████████████████████████████████████████████████████████████▋                       | 1281/1751 [21:24:54<7:50:07, 60.02s/it]                                                                                                                                     {'loss': '0.5464', 'grad_norm': '0.1777', 'learning_rate': '3.7e-06', 'ppl': '1.727', 'memory/max_active (GiB)': '75.16', 'memory/max_allocated (GiB)': '75.16', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.48', 'tokens/total': 2429598720, 'tokens/trainable': 898371072, 'epoch': '2.194'}
 73%|███████████████████████████████████████████████████████████████▋                       | 1281/1751 [21:24:54<7:50:07, 60.02s/it] 73%|███████████████████████████████████████████████████████████████▋                       | 1282/1751 [21:25:51<7:41:39, 59.06s/it]                                                                                                                                     {'loss': '0.54', 'grad_norm': '0.207', 'learning_rate': '3.685e-06', 'ppl': '1.716', 'memory/max_active (GiB)': '75.21', 'memory/max_allocated (GiB)': '75.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '24.02', 'tokens/total': 2431369728, 'tokens/trainable': 898997824, 'epoch': '2.195'}
 73%|███████████████████████████████████████████████████████████████▋                       | 1282/1751 [21:25:51<7:41:39, 59.06s/it] 73%|███████████████████████████████████████████████████████████████▋                       | 1283/1751 [21:26:53<7:47:45, 59.97s/it]                                                                                                                                     {'loss': '0.5511', 'grad_norm': '0.1865', 'learning_rate': '3.671e-06', 'ppl': '1.735', 'memory/max_active (GiB)': '70.86', 'memory/max_allocated (GiB)': '70.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '103.6', 'tokens/total': 2433371904, 'tokens/trainable': 899763776, 'epoch': '2.197'}
 73%|███████████████████████████████████████████████████████████████▋                       | 1283/1751 [21:26:53<7:47:45, 59.97s/it] 73%|███████████████████████████████████████████████████████████████▊                       | 1284/1751 [21:27:50<7:39:57, 59.09s/it]                                                                                                                                     {'loss': '0.5559', 'grad_norm': '0.1816', 'learning_rate': '3.656e-06', 'ppl': '1.744', 'memory/max_active (GiB)': '70.01', 'memory/max_allocated (GiB)': '70.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '42.17', 'tokens/total': 2435122688, 'tokens/trainable': 900429632, 'epoch': '2.199'}
 73%|███████████████████████████████████████████████████████████████▊                       | 1284/1751 [21:27:50<7:39:57, 59.09s/it] 73%|███████████████████████████████████████████████████████████████▊                       | 1285/1751 [21:28:50<7:40:51, 59.34s/it]                                                                                                                                     {'loss': '0.5496', 'grad_norm': '0.1924', 'learning_rate': '3.641e-06', 'ppl': '1.733', 'memory/max_active (GiB)': '72.18', 'memory/max_allocated (GiB)': '72.18', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.2', 'tokens/total': 2437082368, 'tokens/trainable': 901157568, 'epoch': '2.2'}
 73%|███████████████████████████████████████████████████████████████▊                       | 1285/1751 [21:28:50<7:40:51, 59.34s/it] 73%|███████████████████████████████████████████████████████████████▉                       | 1286/1751 [21:29:50<7:40:25, 59.41s/it]                                                                                                                                     {'loss': '0.5078', 'grad_norm': '0.1787', 'learning_rate': '3.627e-06', 'ppl': '1.662', 'memory/max_active (GiB)': '73.71', 'memory/max_allocated (GiB)': '73.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.05', 'tokens/total': 2438996736, 'tokens/trainable': 901851904, 'epoch': '2.202'}
 73%|███████████████████████████████████████████████████████████████▉                       | 1286/1751 [21:29:50<7:40:25, 59.41s/it] 74%|███████████████████████████████████████████████████████████████▉                       | 1287/1751 [21:30:50<7:40:48, 59.59s/it]                                                                                                                                     {'loss': '0.5157', 'grad_norm': '0.1787', 'learning_rate': '3.612e-06', 'ppl': '1.675', 'memory/max_active (GiB)': '68.52', 'memory/max_allocated (GiB)': '68.52', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.8', 'tokens/total': 2440930816, 'tokens/trainable': 902571840, 'epoch': '2.204'}
 74%|███████████████████████████████████████████████████████████████▉                       | 1287/1751 [21:30:50<7:40:48, 59.59s/it] 74%|███████████████████████████████████████████████████████████████▉                       | 1288/1751 [21:31:49<7:38:34, 59.43s/it]                                                                                                                                     {'loss': '0.5293', 'grad_norm': '0.1836', 'learning_rate': '3.598e-06', 'ppl': '1.698', 'memory/max_active (GiB)': '75.74', 'memory/max_allocated (GiB)': '75.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.3', 'tokens/total': 2442806272, 'tokens/trainable': 903261504, 'epoch': '2.206'}
 74%|███████████████████████████████████████████████████████████████▉                       | 1288/1751 [21:31:49<7:38:34, 59.43s/it] 74%|████████████████████████████████████████████████████████████████                       | 1289/1751 [21:32:47<7:34:41, 59.05s/it]                                                                                                                                     {'loss': '0.494', 'grad_norm': '0.1758', 'learning_rate': '3.583e-06', 'ppl': '1.639', 'memory/max_active (GiB)': '75.84', 'memory/max_allocated (GiB)': '75.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '129.9', 'tokens/total': 2444656128, 'tokens/trainable': 903944576, 'epoch': '2.207'}
 74%|████████████████████████████████████████████████████████████████                       | 1289/1751 [21:32:47<7:34:41, 59.05s/it] 74%|████████████████████████████████████████████████████████████████                       | 1290/1751 [21:33:45<7:30:56, 58.69s/it]                                                                                                                                     {'loss': '0.5011', 'grad_norm': '0.1719', 'learning_rate': '3.569e-06', 'ppl': '1.651', 'memory/max_active (GiB)': '71.82', 'memory/max_allocated (GiB)': '71.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '169.5', 'tokens/total': 2446498048, 'tokens/trainable': 904618496, 'epoch': '2.209'}
 74%|████████████████████████████████████████████████████████████████                       | 1290/1751 [21:33:45<7:30:56, 58.69s/it] 74%|████████████████████████████████████████████████████████████████▏                      | 1291/1751 [21:34:41<7:24:51, 58.02s/it]                                                                                                                                     {'loss': '0.5652', 'grad_norm': '0.1895', 'learning_rate': '3.554e-06', 'ppl': '1.76', 'memory/max_active (GiB)': '69.69', 'memory/max_allocated (GiB)': '69.69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '113.4', 'tokens/total': 2448309248, 'tokens/trainable': 905288768, 'epoch': '2.211'}
 74%|████████████████████████████████████████████████████████████████▏                      | 1291/1751 [21:34:41<7:24:51, 58.02s/it] 74%|████████████████████████████████████████████████████████████████▏                      | 1292/1751 [21:35:39<7:23:45, 58.01s/it]                                                                                                                                     {'loss': '0.4864', 'grad_norm': '0.1709', 'learning_rate': '3.54e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '74.42', 'memory/max_allocated (GiB)': '74.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '110.7', 'tokens/total': 2450192384, 'tokens/trainable': 905998784, 'epoch': '2.212'}
 74%|████████████████████████████████████████████████████████████████▏                      | 1292/1751 [21:35:39<7:23:45, 58.01s/it] 74%|████████████████████████████████████████████████████████████████▏                      | 1293/1751 [21:36:37<7:22:23, 57.95s/it]                                                                                                                                     {'loss': '0.5384', 'grad_norm': '0.1787', 'learning_rate': '3.526e-06', 'ppl': '1.713', 'memory/max_active (GiB)': '72.53', 'memory/max_allocated (GiB)': '72.53', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '49.4', 'tokens/total': 2452067072, 'tokens/trainable': 906703104, 'epoch': '2.214'}
 74%|████████████████████████████████████████████████████████████████▏                      | 1293/1751 [21:36:37<7:22:23, 57.95s/it] 74%|████████████████████████████████████████████████████████████████▎                      | 1294/1751 [21:37:34<7:18:34, 57.58s/it]                                                                                                                                     {'loss': '0.5224', 'grad_norm': '0.1973', 'learning_rate': '3.511e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '76.58', 'memory/max_allocated (GiB)': '76.58', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '34.83', 'tokens/total': 2453915904, 'tokens/trainable': 907385088, 'epoch': '2.216'}
 74%|████████████████████████████████████████████████████████████████▎                      | 1294/1751 [21:37:34<7:18:34, 57.58s/it] 74%|████████████████████████████████████████████████████████████████▎                      | 1295/1751 [21:38:33<7:21:31, 58.10s/it]                                                                                                                                     {'loss': '0.525', 'grad_norm': '0.1768', 'learning_rate': '3.497e-06', 'ppl': '1.69', 'memory/max_active (GiB)': '74.5', 'memory/max_allocated (GiB)': '74.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '112.6', 'tokens/total': 2455837440, 'tokens/trainable': 908106176, 'epoch': '2.218'}
 74%|████████████████████████████████████████████████████████████████▎                      | 1295/1751 [21:38:33<7:21:31, 58.10s/it] 74%|████████████████████████████████████████████████████████████████▍                      | 1296/1751 [21:39:30<7:17:59, 57.76s/it]                                                                                                                                     {'loss': '0.5531', 'grad_norm': '0.1943', 'learning_rate': '3.483e-06', 'ppl': '1.739', 'memory/max_active (GiB)': '77.12', 'memory/max_allocated (GiB)': '77.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '190', 'tokens/total': 2457647104, 'tokens/trainable': 908766144, 'epoch': '2.219'}
 74%|████████████████████████████████████████████████████████████████▍                      | 1296/1751 [21:39:30<7:17:59, 57.76s/it] 74%|████████████████████████████████████████████████████████████████▍                      | 1297/1751 [21:40:32<7:26:47, 59.05s/it]                                                                                                                                     {'loss': '0.465', 'grad_norm': '0.1777', 'learning_rate': '3.468e-06', 'ppl': '1.592', 'memory/max_active (GiB)': '74.96', 'memory/max_allocated (GiB)': '74.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '87.81', 'tokens/total': 2459676160, 'tokens/trainable': 909515264, 'epoch': '2.221'}
 74%|████████████████████████████████████████████████████████████████▍                      | 1297/1751 [21:40:32<7:26:47, 59.05s/it] 74%|████████████████████████████████████████████████████████████████▍                      | 1298/1751 [21:41:35<7:35:35, 60.34s/it]                                                                                                                                     {'loss': '0.4669', 'grad_norm': '0.1641', 'learning_rate': '3.454e-06', 'ppl': '1.595', 'memory/max_active (GiB)': '76.55', 'memory/max_allocated (GiB)': '76.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '14.75', 'tokens/total': 2461731584, 'tokens/trainable': 910301248, 'epoch': '2.223'}
 74%|████████████████████████████████████████████████████████████████▍                      | 1298/1751 [21:41:35<7:35:35, 60.34s/it] 74%|████████████████████████████████████████████████████████████████▌                      | 1299/1751 [21:42:38<7:39:37, 61.01s/it]                                                                                                                                     {'loss': '0.5022', 'grad_norm': '0.1777', 'learning_rate': '3.44e-06', 'ppl': '1.652', 'memory/max_active (GiB)': '69.88', 'memory/max_allocated (GiB)': '69.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '56.32', 'tokens/total': 2463669504, 'tokens/trainable': 911010816, 'epoch': '2.224'}
 74%|████████████████████████████████████████████████████████████████▌                      | 1299/1751 [21:42:38<7:39:37, 61.01s/it] 74%|████████████████████████████████████████████████████████████████▌                      | 1300/1751 [21:43:40<7:41:46, 61.43s/it]                                                                                                                                     {'loss': '0.499', 'grad_norm': '0.165', 'learning_rate': '3.425e-06', 'ppl': '1.647', 'memory/max_active (GiB)': '74.87', 'memory/max_allocated (GiB)': '74.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.54', 'tokens/total': 2465648640, 'tokens/trainable': 911753536, 'epoch': '2.226'}
 74%|████████████████████████████████████████████████████████████████▌                      | 1300/1751 [21:43:40<7:41:46, 61.43s/it] 74%|████████████████████████████████████████████████████████████████▋                      | 1301/1751 [21:44:40<7:37:11, 60.96s/it]                                                                                                                                     {'loss': '0.5146', 'grad_norm': '0.1816', 'learning_rate': '3.411e-06', 'ppl': '1.673', 'memory/max_active (GiB)': '74.64', 'memory/max_allocated (GiB)': '74.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '49.53', 'tokens/total': 2467508736, 'tokens/trainable': 912460864, 'epoch': '2.228'}
 74%|████████████████████████████████████████████████████████████████▋                      | 1301/1751 [21:44:40<7:37:11, 60.96s/it] 74%|████████████████████████████████████████████████████████████████▋                      | 1302/1751 [21:45:39<7:30:28, 60.20s/it]                                                                                                                                     {'loss': '0.5224', 'grad_norm': '0.1758', 'learning_rate': '3.397e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '73.46', 'memory/max_allocated (GiB)': '73.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107.2', 'tokens/total': 2469366784, 'tokens/trainable': 913165376, 'epoch': '2.23'}
 74%|████████████████████████████████████████████████████████████████▋                      | 1302/1751 [21:45:39<7:30:28, 60.20s/it] 74%|████████████████████████████████████████████████████████████████▋                      | 1303/1751 [21:46:39<7:29:24, 60.19s/it]                                                                                                                                     {'loss': '0.5176', 'grad_norm': '0.1787', 'learning_rate': '3.383e-06', 'ppl': '1.678', 'memory/max_active (GiB)': '72.13', 'memory/max_allocated (GiB)': '72.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.68', 'tokens/total': 2471291392, 'tokens/trainable': 913865152, 'epoch': '2.231'}
 74%|████████████████████████████████████████████████████████████████▋                      | 1303/1751 [21:46:39<7:29:24, 60.19s/it] 74%|████████████████████████████████████████████████████████████████▊                      | 1304/1751 [21:47:40<7:30:24, 60.46s/it]                                                                                                                                     {'loss': '0.53', 'grad_norm': '0.1719', 'learning_rate': '3.369e-06', 'ppl': '1.699', 'memory/max_active (GiB)': '75.19', 'memory/max_allocated (GiB)': '75.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108', 'tokens/total': 2473216256, 'tokens/trainable': 914618368, 'epoch': '2.233'}
 74%|████████████████████████████████████████████████████████████████▊                      | 1304/1751 [21:47:40<7:30:24, 60.46s/it] 75%|████████████████████████████████████████████████████████████████▊                      | 1305/1751 [21:48:37<7:22:35, 59.54s/it]                                                                                                                                     {'loss': '0.5504', 'grad_norm': '0.1816', 'learning_rate': '3.355e-06', 'ppl': '1.734', 'memory/max_active (GiB)': '77.1', 'memory/max_allocated (GiB)': '77.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '41.12', 'tokens/total': 2475004928, 'tokens/trainable': 915304512, 'epoch': '2.235'}
 75%|████████████████████████████████████████████████████████████████▊                      | 1305/1751 [21:48:37<7:22:35, 59.54s/it] 75%|████████████████████████████████████████████████████████████████▉                      | 1306/1751 [21:49:38<7:24:31, 59.94s/it]                                                                                                                                     {'loss': '0.5232', 'grad_norm': '0.1709', 'learning_rate': '3.341e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '75.42', 'memory/max_allocated (GiB)': '75.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '175.7', 'tokens/total': 2476976896, 'tokens/trainable': 916037504, 'epoch': '2.236'}
 75%|████████████████████████████████████████████████████████████████▉                      | 1306/1751 [21:49:38<7:24:31, 59.94s/it] 75%|████████████████████████████████████████████████████████████████▉                      | 1307/1751 [21:50:40<7:27:49, 60.52s/it]                                                                                                                                     {'loss': '0.5179', 'grad_norm': '0.1836', 'learning_rate': '3.326e-06', 'ppl': '1.678', 'memory/max_active (GiB)': '76.12', 'memory/max_allocated (GiB)': '76.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '131.7', 'tokens/total': 2478977536, 'tokens/trainable': 916758720, 'epoch': '2.238'}
 75%|████████████████████████████████████████████████████████████████▉                      | 1307/1751 [21:50:40<7:27:49, 60.52s/it] 75%|████████████████████████████████████████████████████████████████▉                      | 1308/1751 [21:51:37<7:18:26, 59.38s/it]                                                                                                                                     {'loss': '0.5213', 'grad_norm': '0.1934', 'learning_rate': '3.312e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '75.48', 'memory/max_allocated (GiB)': '75.48', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26.28', 'tokens/total': 2480734464, 'tokens/trainable': 917412928, 'epoch': '2.24'}
 75%|████████████████████████████████████████████████████████████████▉                      | 1308/1751 [21:51:37<7:18:26, 59.38s/it] 75%|█████████████████████████████████████████████████████████████████                      | 1309/1751 [21:52:36<7:17:00, 59.32s/it]                                                                                                                                     {'loss': '0.5202', 'grad_norm': '0.1729', 'learning_rate': '3.298e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '76.26', 'memory/max_allocated (GiB)': '76.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84', 'tokens/total': 2482626560, 'tokens/trainable': 918110400, 'epoch': '2.242'}
 75%|█████████████████████████████████████████████████████████████████                      | 1309/1751 [21:52:36<7:17:00, 59.32s/it] 75%|█████████████████████████████████████████████████████████████████                      | 1310/1751 [21:53:34<7:12:30, 58.84s/it]                                                                                                                                     {'loss': '0.5581', 'grad_norm': '0.1924', 'learning_rate': '3.284e-06', 'ppl': '1.747', 'memory/max_active (GiB)': '76.01', 'memory/max_allocated (GiB)': '76.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '158.4', 'tokens/total': 2484422912, 'tokens/trainable': 918763392, 'epoch': '2.243'}
 75%|█████████████████████████████████████████████████████████████████                      | 1310/1751 [21:53:34<7:12:30, 58.84s/it] 75%|█████████████████████████████████████████████████████████████████▏                     | 1311/1751 [21:54:33<7:13:08, 59.06s/it]                                                                                                                                     {'loss': '0.5271', 'grad_norm': '0.1807', 'learning_rate': '3.27e-06', 'ppl': '1.694', 'memory/max_active (GiB)': '77.76', 'memory/max_allocated (GiB)': '77.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '115.6', 'tokens/total': 2486313728, 'tokens/trainable': 919444096, 'epoch': '2.245'}
 75%|█████████████████████████████████████████████████████████████████▏                     | 1311/1751 [21:54:33<7:13:08, 59.06s/it] 75%|█████████████████████████████████████████████████████████████████▏                     | 1312/1751 [21:55:30<7:07:14, 58.39s/it]                                                                                                                                     {'loss': '0.5587', 'grad_norm': '0.2012', 'learning_rate': '3.256e-06', 'ppl': '1.748', 'memory/max_active (GiB)': '70.76', 'memory/max_allocated (GiB)': '70.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107', 'tokens/total': 2488079104, 'tokens/trainable': 920069888, 'epoch': '2.247'}
 75%|█████████████████████████████████████████████████████████████████▏                     | 1312/1751 [21:55:30<7:07:14, 58.39s/it] 75%|█████████████████████████████████████████████████████████████████▏                     | 1313/1751 [21:56:28<7:04:24, 58.14s/it]                                                                                                                                     {'loss': '0.526', 'grad_norm': '0.1816', 'learning_rate': '3.243e-06', 'ppl': '1.692', 'memory/max_active (GiB)': '68.83', 'memory/max_allocated (GiB)': '68.83', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '63.67', 'tokens/total': 2489895424, 'tokens/trainable': 920747456, 'epoch': '2.248'}
 75%|█████████████████████████████████████████████████████████████████▏                     | 1313/1751 [21:56:28<7:04:24, 58.14s/it] 75%|█████████████████████████████████████████████████████████████████▎                     | 1314/1751 [21:57:28<7:07:32, 58.70s/it]                                                                                                                                     {'loss': '0.531', 'grad_norm': '0.1826', 'learning_rate': '3.229e-06', 'ppl': '1.701', 'memory/max_active (GiB)': '76.68', 'memory/max_allocated (GiB)': '76.68', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '55.71', 'tokens/total': 2491857408, 'tokens/trainable': 921479040, 'epoch': '2.25'}
 75%|█████████████████████████████████████████████████████████████████▎                     | 1314/1751 [21:57:28<7:07:32, 58.70s/it] 75%|█████████████████████████████████████████████████████████████████▎                     | 1315/1751 [21:58:27<7:07:17, 58.80s/it]                                                                                                                                     {'loss': '0.5187', 'grad_norm': '0.1816', 'learning_rate': '3.215e-06', 'ppl': '1.68', 'memory/max_active (GiB)': '72.19', 'memory/max_allocated (GiB)': '72.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '48.69', 'tokens/total': 2493735168, 'tokens/trainable': 922174336, 'epoch': '2.252'}
 75%|█████████████████████████████████████████████████████████████████▎                     | 1315/1751 [21:58:27<7:07:17, 58.80s/it] 75%|█████████████████████████████████████████████████████████████████▍                     | 1316/1751 [21:59:23<7:01:55, 58.20s/it]                                                                                                                                     {'loss': '0.5115', 'grad_norm': '0.1816', 'learning_rate': '3.201e-06', 'ppl': '1.668', 'memory/max_active (GiB)': '75.26', 'memory/max_allocated (GiB)': '75.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '129.6', 'tokens/total': 2495575552, 'tokens/trainable': 922852096, 'epoch': '2.254'}
 75%|█████████████████████████████████████████████████████████████████▍                     | 1316/1751 [21:59:23<7:01:55, 58.20s/it] 75%|█████████████████████████████████████████████████████████████████▍                     | 1317/1751 [22:00:22<7:02:47, 58.45s/it]                                                                                                                                     {'loss': '0.529', 'grad_norm': '0.1768', 'learning_rate': '3.187e-06', 'ppl': '1.697', 'memory/max_active (GiB)': '75.1', 'memory/max_allocated (GiB)': '75.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '116', 'tokens/total': 2497494016, 'tokens/trainable': 923533312, 'epoch': '2.255'}
 75%|█████████████████████████████████████████████████████████████████▍                     | 1317/1751 [22:00:22<7:02:47, 58.45s/it] 75%|█████████████████████████████████████████████████████████████████▍                     | 1318/1751 [22:01:20<6:58:54, 58.05s/it]                                                                                                                                     {'loss': '0.5101', 'grad_norm': '0.1904', 'learning_rate': '3.173e-06', 'ppl': '1.665', 'memory/max_active (GiB)': '76.1', 'memory/max_allocated (GiB)': '76.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.3', 'tokens/total': 2499313664, 'tokens/trainable': 924189632, 'epoch': '2.257'}
 75%|█████████████████████████████████████████████████████████████████▍                     | 1318/1751 [22:01:20<6:58:54, 58.05s/it] 75%|█████████████████████████████████████████████████████████████████▌                     | 1319/1751 [22:02:18<6:58:36, 58.14s/it]                                                                                                                                     {'loss': '0.5567', 'grad_norm': '0.1895', 'learning_rate': '3.159e-06', 'ppl': '1.745', 'memory/max_active (GiB)': '71.07', 'memory/max_allocated (GiB)': '71.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '188.8', 'tokens/total': 2501171456, 'tokens/trainable': 924884736, 'epoch': '2.259'}
 75%|█████████████████████████████████████████████████████████████████▌                     | 1319/1751 [22:02:18<6:58:36, 58.14s/it] 75%|█████████████████████████████████████████████████████████████████▌                     | 1320/1751 [22:03:19<7:04:17, 59.07s/it]                                                                                                                                     {'loss': '0.5052', 'grad_norm': '0.1826', 'learning_rate': '3.146e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '75.2', 'memory/max_allocated (GiB)': '75.2', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '189.3', 'tokens/total': 2503132672, 'tokens/trainable': 925643136, 'epoch': '2.26'}
 75%|█████████████████████████████████████████████████████████████████▌                     | 1320/1751 [22:03:19<7:04:17, 59.07s/it] 75%|█████████████████████████████████████████████████████████████████▋                     | 1321/1751 [22:04:19<7:05:24, 59.36s/it]                                                                                                                                     {'loss': '0.5221', 'grad_norm': '0.1719', 'learning_rate': '3.132e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '71.19', 'memory/max_allocated (GiB)': '71.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.56', 'tokens/total': 2505007360, 'tokens/trainable': 926348736, 'epoch': '2.262'}
 75%|█████████████████████████████████████████████████████████████████▋                     | 1321/1751 [22:04:19<7:05:24, 59.36s/it] 75%|█████████████████████████████████████████████████████████████████▋                     | 1322/1751 [22:05:20<7:08:34, 59.94s/it]                                                                                                                                     {'loss': '0.5068', 'grad_norm': '0.1836', 'learning_rate': '3.118e-06', 'ppl': '1.66', 'memory/max_active (GiB)': '74.24', 'memory/max_allocated (GiB)': '74.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '116.9', 'tokens/total': 2506979328, 'tokens/trainable': 927048512, 'epoch': '2.264'}
 75%|█████████████████████████████████████████████████████████████████▋                     | 1322/1751 [22:05:20<7:08:34, 59.94s/it] 76%|█████████████████████████████████████████████████████████████████▋                     | 1323/1751 [22:06:18<7:02:24, 59.22s/it]                                                                                                                                     {'loss': '0.5195', 'grad_norm': '0.1875', 'learning_rate': '3.105e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '74.47', 'memory/max_allocated (GiB)': '74.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '119.9', 'tokens/total': 2508812800, 'tokens/trainable': 927702080, 'epoch': '2.266'}
 76%|█████████████████████████████████████████████████████████████████▋                     | 1323/1751 [22:06:18<7:02:24, 59.22s/it] 76%|█████████████████████████████████████████████████████████████████▊                     | 1324/1751 [22:07:17<7:01:55, 59.29s/it]                                                                                                                                     {'loss': '0.5084', 'grad_norm': '0.1787', 'learning_rate': '3.091e-06', 'ppl': '1.663', 'memory/max_active (GiB)': '75.41', 'memory/max_allocated (GiB)': '75.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '61.14', 'tokens/total': 2510692352, 'tokens/trainable': 928410176, 'epoch': '2.267'}
 76%|█████████████████████████████████████████████████████████████████▊                     | 1324/1751 [22:07:17<7:01:55, 59.29s/it] 76%|█████████████████████████████████████████████████████████████████▊                     | 1325/1751 [22:08:18<7:02:41, 59.53s/it]                                                                                                                                     {'loss': '0.5333', 'grad_norm': '0.1807', 'learning_rate': '3.077e-06', 'ppl': '1.705', 'memory/max_active (GiB)': '72.59', 'memory/max_allocated (GiB)': '72.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '31.2', 'tokens/total': 2512611072, 'tokens/trainable': 929129472, 'epoch': '2.269'}
 76%|█████████████████████████████████████████████████████████████████▊                     | 1325/1751 [22:08:18<7:02:41, 59.53s/it] 76%|█████████████████████████████████████████████████████████████████▉                     | 1326/1751 [22:09:16<6:59:03, 59.16s/it]                                                                                                                                     {'loss': '0.5287', 'grad_norm': '0.1904', 'learning_rate': '3.064e-06', 'ppl': '1.697', 'memory/max_active (GiB)': '75.03', 'memory/max_allocated (GiB)': '75.03', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '113.5', 'tokens/total': 2514399744, 'tokens/trainable': 929781248, 'epoch': '2.271'}
 76%|█████████████████████████████████████████████████████████████████▉                     | 1326/1751 [22:09:16<6:59:03, 59.16s/it] 76%|█████████████████████████████████████████████████████████████████▉                     | 1327/1751 [22:10:14<6:56:47, 58.98s/it]                                                                                                                                     {'loss': '0.5446', 'grad_norm': '0.1875', 'learning_rate': '3.05e-06', 'ppl': '1.724', 'memory/max_active (GiB)': '73.99', 'memory/max_allocated (GiB)': '73.99', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '61.23', 'tokens/total': 2516279040, 'tokens/trainable': 930464640, 'epoch': '2.272'}
 76%|█████████████████████████████████████████████████████████████████▉                     | 1327/1751 [22:10:14<6:56:47, 58.98s/it] 76%|█████████████████████████████████████████████████████████████████▉                     | 1328/1751 [22:11:14<6:56:20, 59.06s/it]                                                                                                                                     {'loss': '0.5505', 'grad_norm': '0.1836', 'learning_rate': '3.037e-06', 'ppl': '1.734', 'memory/max_active (GiB)': '72.11', 'memory/max_allocated (GiB)': '72.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '61.55', 'tokens/total': 2518141184, 'tokens/trainable': 931153728, 'epoch': '2.274'}
 76%|█████████████████████████████████████████████████████████████████▉                     | 1328/1751 [22:11:14<6:56:20, 59.06s/it] 76%|██████████████████████████████████████████████████████████████████                     | 1329/1751 [22:12:13<6:56:39, 59.24s/it]                                                                                                                                     {'loss': '0.4922', 'grad_norm': '0.1699', 'learning_rate': '3.023e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '73.6', 'memory/max_allocated (GiB)': '73.6', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.82', 'tokens/total': 2520031744, 'tokens/trainable': 931869376, 'epoch': '2.276'}
 76%|██████████████████████████████████████████████████████████████████                     | 1329/1751 [22:12:13<6:56:39, 59.24s/it] 76%|██████████████████████████████████████████████████████████████████                     | 1330/1751 [22:13:15<7:00:44, 59.96s/it]                                                                                                                                     {'loss': '0.4846', 'grad_norm': '0.1709', 'learning_rate': '3.009e-06', 'ppl': '1.623', 'memory/max_active (GiB)': '76.36', 'memory/max_allocated (GiB)': '76.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '120.8', 'tokens/total': 2521997312, 'tokens/trainable': 932614912, 'epoch': '2.278'}
 76%|██████████████████████████████████████████████████████████████████                     | 1330/1751 [22:13:15<7:00:44, 59.96s/it] 76%|██████████████████████████████████████████████████████████████████▏                    | 1331/1751 [22:14:13<6:54:55, 59.27s/it]                                                                                                                                     {'loss': '0.5', 'grad_norm': '0.1865', 'learning_rate': '2.996e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '74.58', 'memory/max_allocated (GiB)': '74.58', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '15.95', 'tokens/total': 2523810048, 'tokens/trainable': 933276480, 'epoch': '2.279'}
 76%|██████████████████████████████████████████████████████████████████▏                    | 1331/1751 [22:14:13<6:54:55, 59.27s/it] 76%|██████████████████████████████████████████████████████████████████▏                    | 1332/1751 [22:15:15<6:59:45, 60.11s/it]                                                                                                                                     {'loss': '0.5014', 'grad_norm': '0.1816', 'learning_rate': '2.983e-06', 'ppl': '1.651', 'memory/max_active (GiB)': '72.31', 'memory/max_allocated (GiB)': '72.31', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.97', 'tokens/total': 2525785856, 'tokens/trainable': 933987072, 'epoch': '2.281'}
 76%|██████████████████████████████████████████████████████████████████▏                    | 1332/1751 [22:15:15<6:59:45, 60.11s/it] 76%|██████████████████████████████████████████████████████████████████▏                    | 1333/1751 [22:16:13<6:55:21, 59.62s/it]                                                                                                                                     {'loss': '0.5318', 'grad_norm': '0.1846', 'learning_rate': '2.969e-06', 'ppl': '1.702', 'memory/max_active (GiB)': '73.25', 'memory/max_allocated (GiB)': '73.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104', 'tokens/total': 2527659776, 'tokens/trainable': 934675712, 'epoch': '2.283'}
 76%|██████████████████████████████████████████████████████████████████▏                    | 1333/1751 [22:16:13<6:55:21, 59.62s/it] 76%|██████████████████████████████████████████████████████████████████▎                    | 1334/1751 [22:17:14<6:56:52, 59.98s/it]                                                                                                                                     {'loss': '0.5362', 'grad_norm': '0.1846', 'learning_rate': '2.956e-06', 'ppl': '1.709', 'memory/max_active (GiB)': '65.63', 'memory/max_allocated (GiB)': '65.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '25.6', 'tokens/total': 2529614592, 'tokens/trainable': 935404096, 'epoch': '2.284'}
 76%|██████████████████████████████████████████████████████████████████▎                    | 1334/1751 [22:17:14<6:56:52, 59.98s/it] 76%|██████████████████████████████████████████████████████████████████▎                    | 1335/1751 [22:18:14<6:55:27, 59.92s/it]                                                                                                                                     {'loss': '0.5619', 'grad_norm': '0.1846', 'learning_rate': '2.942e-06', 'ppl': '1.754', 'memory/max_active (GiB)': '68.74', 'memory/max_allocated (GiB)': '68.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '77.12', 'tokens/total': 2531502080, 'tokens/trainable': 936108480, 'epoch': '2.286'}
 76%|██████████████████████████████████████████████████████████████████▎                    | 1335/1751 [22:18:14<6:55:27, 59.92s/it] 76%|██████████████████████████████████████████████████████████████████▍                    | 1336/1751 [22:19:12<6:51:52, 59.55s/it]                                                                                                                                     {'loss': '0.5219', 'grad_norm': '0.1826', 'learning_rate': '2.929e-06', 'ppl': '1.685', 'memory/max_active (GiB)': '71.79', 'memory/max_allocated (GiB)': '71.79', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '76.76', 'tokens/total': 2533388288, 'tokens/trainable': 936794944, 'epoch': '2.288'}
 76%|██████████████████████████████████████████████████████████████████▍                    | 1336/1751 [22:19:12<6:51:52, 59.55s/it] 76%|██████████████████████████████████████████████████████████████████▍                    | 1337/1751 [22:20:12<6:51:51, 59.69s/it]                                                                                                                                     {'loss': '0.5188', 'grad_norm': '0.1758', 'learning_rate': '2.916e-06', 'ppl': '1.68', 'memory/max_active (GiB)': '69.07', 'memory/max_allocated (GiB)': '69.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '83.86', 'tokens/total': 2535328256, 'tokens/trainable': 937519232, 'epoch': '2.29'}
 76%|██████████████████████████████████████████████████████████████████▍                    | 1337/1751 [22:20:12<6:51:51, 59.69s/it] 76%|██████████████████████████████████████████████████████████████████▍                    | 1338/1751 [22:21:13<6:53:09, 60.02s/it]                                                                                                                                     {'loss': '0.4935', 'grad_norm': '0.1719', 'learning_rate': '2.902e-06', 'ppl': '1.638', 'memory/max_active (GiB)': '76.46', 'memory/max_allocated (GiB)': '76.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '68.66', 'tokens/total': 2537286400, 'tokens/trainable': 938218944, 'epoch': '2.291'}
 76%|██████████████████████████████████████████████████████████████████▍                    | 1338/1751 [22:21:13<6:53:09, 60.02s/it] 76%|██████████████████████████████████████████████████████████████████▌                    | 1339/1751 [22:22:10<6:46:13, 59.16s/it]                                                                                                                                     {'loss': '0.5199', 'grad_norm': '0.1855', 'learning_rate': '2.889e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '73.26', 'memory/max_allocated (GiB)': '73.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '146.3', 'tokens/total': 2539082752, 'tokens/trainable': 938892224, 'epoch': '2.293'}
 76%|██████████████████████████████████████████████████████████████████▌                    | 1339/1751 [22:22:10<6:46:13, 59.16s/it] 77%|██████████████████████████████████████████████████████████████████▌                    | 1340/1751 [22:23:10<6:45:16, 59.16s/it]                                                                                                                                     {'loss': '0.4909', 'grad_norm': '0.1797', 'learning_rate': '2.876e-06', 'ppl': '1.634', 'memory/max_active (GiB)': '71.54', 'memory/max_allocated (GiB)': '71.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.72', 'tokens/total': 2540948992, 'tokens/trainable': 939565248, 'epoch': '2.295'}
 77%|██████████████████████████████████████████████████████████████████▌                    | 1340/1751 [22:23:10<6:45:16, 59.16s/it] 77%|██████████████████████████████████████████████████████████████████▋                    | 1341/1751 [22:24:10<6:46:20, 59.47s/it]                                                                                                                                     {'loss': '0.4993', 'grad_norm': '0.1689', 'learning_rate': '2.862e-06', 'ppl': '1.648', 'memory/max_active (GiB)': '75.9', 'memory/max_allocated (GiB)': '75.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.18', 'tokens/total': 2542883584, 'tokens/trainable': 940295936, 'epoch': '2.296'}
 77%|██████████████████████████████████████████████████████████████████▋                    | 1341/1751 [22:24:10<6:46:20, 59.47s/it] 77%|██████████████████████████████████████████████████████████████████▋                    | 1342/1751 [22:25:12<6:51:12, 60.32s/it]                                                                                                                                     {'loss': '0.5223', 'grad_norm': '0.1719', 'learning_rate': '2.849e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '76.98', 'memory/max_allocated (GiB)': '76.98', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.12', 'tokens/total': 2544867072, 'tokens/trainable': 941024704, 'epoch': '2.298'}
 77%|██████████████████████████████████████████████████████████████████▋                    | 1342/1751 [22:25:12<6:51:12, 60.32s/it] 77%|██████████████████████████████████████████████████████████████████▋                    | 1343/1751 [22:26:14<6:54:15, 60.92s/it]                                                                                                                                     {'loss': '0.5077', 'grad_norm': '0.1709', 'learning_rate': '2.836e-06', 'ppl': '1.661', 'memory/max_active (GiB)': '74.76', 'memory/max_allocated (GiB)': '74.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '226', 'tokens/total': 2546846720, 'tokens/trainable': 941761408, 'epoch': '2.3'}
 77%|██████████████████████████████████████████████████████████████████▋                    | 1343/1751 [22:26:14<6:54:15, 60.92s/it] 77%|██████████████████████████████████████████████████████████████████▊                    | 1344/1751 [22:27:16<6:54:19, 61.08s/it]                                                                                                                                     {'loss': '0.5388', 'grad_norm': '0.1777', 'learning_rate': '2.823e-06', 'ppl': '1.714', 'memory/max_active (GiB)': '76.25', 'memory/max_allocated (GiB)': '76.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '121.4', 'tokens/total': 2548789760, 'tokens/trainable': 942477312, 'epoch': '2.302'}
 77%|██████████████████████████████████████████████████████████████████▊                    | 1344/1751 [22:27:16<6:54:19, 61.08s/it] 77%|██████████████████████████████████████████████████████████████████▊                    | 1345/1751 [22:28:17<6:53:13, 61.07s/it]                                                                                                                                     {'loss': '0.5299', 'grad_norm': '0.1865', 'learning_rate': '2.81e-06', 'ppl': '1.699', 'memory/max_active (GiB)': '74.9', 'memory/max_allocated (GiB)': '74.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '66.41', 'tokens/total': 2550783744, 'tokens/trainable': 943187008, 'epoch': '2.303'}
 77%|██████████████████████████████████████████████████████████████████▊                    | 1345/1751 [22:28:17<6:53:13, 61.07s/it] 77%|██████████████████████████████████████████████████████████████████▉                    | 1346/1751 [22:29:14<6:45:13, 60.03s/it]                                                                                                                                     {'loss': '0.5186', 'grad_norm': '0.1865', 'learning_rate': '2.797e-06', 'ppl': '1.68', 'memory/max_active (GiB)': '72.78', 'memory/max_allocated (GiB)': '72.78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '133.5', 'tokens/total': 2552599808, 'tokens/trainable': 943884480, 'epoch': '2.305'}
 77%|██████████████████████████████████████████████████████████████████▉                    | 1346/1751 [22:29:14<6:45:13, 60.03s/it] 77%|██████████████████████████████████████████████████████████████████▉                    | 1347/1751 [22:30:16<6:47:56, 60.59s/it]                                                                                                                                     {'loss': '0.489', 'grad_norm': '0.1631', 'learning_rate': '2.784e-06', 'ppl': '1.631', 'memory/max_active (GiB)': '73.73', 'memory/max_allocated (GiB)': '73.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '45.56', 'tokens/total': 2554597376, 'tokens/trainable': 944634176, 'epoch': '2.307'}
 77%|██████████████████████████████████████████████████████████████████▉                    | 1347/1751 [22:30:16<6:47:56, 60.59s/it] 77%|██████████████████████████████████████████████████████████████████▉                    | 1348/1751 [22:31:16<6:44:55, 60.29s/it]                                                                                                                                     {'loss': '0.5042', 'grad_norm': '0.1709', 'learning_rate': '2.771e-06', 'ppl': '1.656', 'memory/max_active (GiB)': '76.41', 'memory/max_allocated (GiB)': '76.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '94.79', 'tokens/total': 2556514560, 'tokens/trainable': 945333440, 'epoch': '2.308'}
 77%|██████████████████████████████████████████████████████████████████▉                    | 1348/1751 [22:31:16<6:44:55, 60.29s/it] 77%|███████████████████████████████████████████████████████████████████                    | 1349/1751 [22:32:14<6:40:09, 59.73s/it]                                                                                                                                     {'loss': '0.5285', 'grad_norm': '0.1758', 'learning_rate': '2.758e-06', 'ppl': '1.696', 'memory/max_active (GiB)': '70.34', 'memory/max_allocated (GiB)': '70.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '137.4', 'tokens/total': 2558385152, 'tokens/trainable': 946029952, 'epoch': '2.31'}
 77%|███████████████████████████████████████████████████████████████████                    | 1349/1751 [22:32:14<6:40:09, 59.73s/it] 77%|███████████████████████████████████████████████████████████████████                    | 1350/1751 [22:33:13<6:36:11, 59.28s/it]                                                                                                                                     {'loss': '0.4888', 'grad_norm': '0.2021', 'learning_rate': '2.745e-06', 'ppl': '1.63', 'memory/max_active (GiB)': '74.81', 'memory/max_allocated (GiB)': '74.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '160.2', 'tokens/total': 2560238592, 'tokens/trainable': 946689280, 'epoch': '2.312'}
 77%|███████████████████████████████████████████████████████████████████                    | 1350/1751 [22:33:13<6:36:11, 59.28s/it] 77%|███████████████████████████████████████████████████████████████████▏                   | 1351/1751 [22:34:12<6:35:17, 59.29s/it]                                                                                                                                     {'loss': '0.4989', 'grad_norm': '0.1748', 'learning_rate': '2.732e-06', 'ppl': '1.647', 'memory/max_active (GiB)': '74.34', 'memory/max_allocated (GiB)': '74.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.58', 'tokens/total': 2562156544, 'tokens/trainable': 947397504, 'epoch': '2.314'}
 77%|███████████████████████████████████████████████████████████████████▏                   | 1351/1751 [22:34:12<6:35:17, 59.29s/it] 77%|███████████████████████████████████████████████████████████████████▏                   | 1352/1751 [22:35:10<6:32:38, 59.04s/it]                                                                                                                                     {'loss': '0.5581', 'grad_norm': '0.1982', 'learning_rate': '2.719e-06', 'ppl': '1.747', 'memory/max_active (GiB)': '75.63', 'memory/max_allocated (GiB)': '75.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '124.5', 'tokens/total': 2564009984, 'tokens/trainable': 948061568, 'epoch': '2.315'}
 77%|███████████████████████████████████████████████████████████████████▏                   | 1352/1751 [22:35:10<6:32:38, 59.04s/it] 77%|███████████████████████████████████████████████████████████████████▏                   | 1353/1751 [22:36:12<6:37:12, 59.88s/it]                                                                                                                                     {'loss': '0.5125', 'grad_norm': '0.1777', 'learning_rate': '2.706e-06', 'ppl': '1.669', 'memory/max_active (GiB)': '72.73', 'memory/max_allocated (GiB)': '72.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '100.8', 'tokens/total': 2565980416, 'tokens/trainable': 948809536, 'epoch': '2.317'}
 77%|███████████████████████████████████████████████████████████████████▏                   | 1353/1751 [22:36:12<6:37:12, 59.88s/it] 77%|███████████████████████████████████████████████████████████████████▎                   | 1354/1751 [22:37:16<6:44:22, 61.12s/it]                                                                                                                                     {'loss': '0.514', 'grad_norm': '0.1689', 'learning_rate': '2.693e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '75.65', 'memory/max_allocated (GiB)': '75.65', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '72.87', 'tokens/total': 2568064000, 'tokens/trainable': 949572608, 'epoch': '2.319'}
 77%|███████████████████████████████████████████████████████████████████▎                   | 1354/1751 [22:37:16<6:44:22, 61.12s/it] 77%|███████████████████████████████████████████████████████████████████▎                   | 1355/1751 [22:38:18<6:44:49, 61.34s/it]                                                                                                                                     {'loss': '0.5023', 'grad_norm': '0.1738', 'learning_rate': '2.68e-06', 'ppl': '1.653', 'memory/max_active (GiB)': '76.45', 'memory/max_allocated (GiB)': '76.45', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '101.9', 'tokens/total': 2570015488, 'tokens/trainable': 950312064, 'epoch': '2.32'}
 77%|███████████████████████████████████████████████████████████████████▎                   | 1355/1751 [22:38:18<6:44:49, 61.34s/it] 77%|███████████████████████████████████████████████████████████████████▎                   | 1356/1751 [22:39:18<6:41:27, 60.98s/it]                                                                                                                                     {'loss': '0.5364', 'grad_norm': '0.1807', 'learning_rate': '2.667e-06', 'ppl': '1.71', 'memory/max_active (GiB)': '74.51', 'memory/max_allocated (GiB)': '74.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.25', 'tokens/total': 2571844608, 'tokens/trainable': 951008384, 'epoch': '2.322'}
 77%|███████████████████████████████████████████████████████████████████▎                   | 1356/1751 [22:39:18<6:41:27, 60.98s/it] 77%|███████████████████████████████████████████████████████████████████▍                   | 1357/1751 [22:40:19<6:40:00, 60.92s/it]                                                                                                                                     {'loss': '0.5025', 'grad_norm': '0.1758', 'learning_rate': '2.654e-06', 'ppl': '1.653', 'memory/max_active (GiB)': '74.72', 'memory/max_allocated (GiB)': '74.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '77.31', 'tokens/total': 2573774592, 'tokens/trainable': 951720960, 'epoch': '2.324'}
 77%|███████████████████████████████████████████████████████████████████▍                   | 1357/1751 [22:40:19<6:40:00, 60.92s/it] 78%|███████████████████████████████████████████████████████████████████▍                   | 1358/1751 [22:41:20<6:38:51, 60.89s/it]                                                                                                                                     {'loss': '0.5327', 'grad_norm': '0.1787', 'learning_rate': '2.641e-06', 'ppl': '1.703', 'memory/max_active (GiB)': '75.36', 'memory/max_allocated (GiB)': '75.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '74.16', 'tokens/total': 2575699200, 'tokens/trainable': 952446080, 'epoch': '2.326'}
 78%|███████████████████████████████████████████████████████████████████▍                   | 1358/1751 [22:41:20<6:38:51, 60.89s/it] 78%|███████████████████████████████████████████████████████████████████▌                   | 1359/1751 [22:42:20<6:37:21, 60.82s/it]                                                                                                                                     {'loss': '0.5115', 'grad_norm': '0.1748', 'learning_rate': '2.629e-06', 'ppl': '1.668', 'memory/max_active (GiB)': '69', 'memory/max_allocated (GiB)': '69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.2', 'tokens/total': 2577603328, 'tokens/trainable': 953153920, 'epoch': '2.327'}
 78%|███████████████████████████████████████████████████████████████████▌                   | 1359/1751 [22:42:20<6:37:21, 60.82s/it] 78%|███████████████████████████████████████████████████████████████████▌                   | 1360/1751 [22:43:21<6:35:57, 60.76s/it]                                                                                                                                     {'loss': '0.5217', 'grad_norm': '0.1748', 'learning_rate': '2.616e-06', 'ppl': '1.685', 'memory/max_active (GiB)': '70.15', 'memory/max_allocated (GiB)': '70.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '160.4', 'tokens/total': 2579428608, 'tokens/trainable': 953814336, 'epoch': '2.329'}
 78%|███████████████████████████████████████████████████████████████████▌                   | 1360/1751 [22:43:21<6:35:57, 60.76s/it] 78%|███████████████████████████████████████████████████████████████████▌                   | 1361/1751 [22:44:22<6:35:01, 60.77s/it]                                                                                                                                     {'loss': '0.5496', 'grad_norm': '0.1797', 'learning_rate': '2.603e-06', 'ppl': '1.733', 'memory/max_active (GiB)': '70.48', 'memory/max_allocated (GiB)': '70.48', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '101.3', 'tokens/total': 2581317632, 'tokens/trainable': 954531840, 'epoch': '2.331'}
 78%|███████████████████████████████████████████████████████████████████▌                   | 1361/1751 [22:44:22<6:35:01, 60.77s/it] 78%|███████████████████████████████████████████████████████████████████▋                   | 1362/1751 [22:45:21<6:31:43, 60.42s/it]                                                                                                                                     {'loss': '0.5423', 'grad_norm': '0.1777', 'learning_rate': '2.59e-06', 'ppl': '1.72', 'memory/max_active (GiB)': '70.84', 'memory/max_allocated (GiB)': '70.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '200.8', 'tokens/total': 2583191552, 'tokens/trainable': 955257024, 'epoch': '2.332'}
 78%|███████████████████████████████████████████████████████████████████▋                   | 1362/1751 [22:45:21<6:31:43, 60.42s/it] 78%|███████████████████████████████████████████████████████████████████▋                   | 1363/1751 [22:46:21<6:28:05, 60.01s/it]                                                                                                                                     {'loss': '0.5211', 'grad_norm': '0.1748', 'learning_rate': '2.578e-06', 'ppl': '1.684', 'memory/max_active (GiB)': '69.38', 'memory/max_allocated (GiB)': '69.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.32', 'tokens/total': 2585077248, 'tokens/trainable': 955953984, 'epoch': '2.334'}
 78%|███████████████████████████████████████████████████████████████████▋                   | 1363/1751 [22:46:21<6:28:05, 60.01s/it] 78%|███████████████████████████████████████████████████████████████████▊                   | 1364/1751 [22:47:18<6:21:32, 59.15s/it]                                                                                                                                     {'loss': '0.5482', 'grad_norm': '0.1875', 'learning_rate': '2.565e-06', 'ppl': '1.73', 'memory/max_active (GiB)': '75.05', 'memory/max_allocated (GiB)': '75.05', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.2', 'tokens/total': 2586822912, 'tokens/trainable': 956569472, 'epoch': '2.336'}
 78%|███████████████████████████████████████████████████████████████████▊                   | 1364/1751 [22:47:18<6:21:32, 59.15s/it] 78%|███████████████████████████████████████████████████████████████████▊                   | 1365/1751 [22:48:17<6:21:46, 59.34s/it]                                                                                                                                     {'loss': '0.5062', 'grad_norm': '0.1709', 'learning_rate': '2.553e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '75.42', 'memory/max_allocated (GiB)': '75.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '115.2', 'tokens/total': 2588738816, 'tokens/trainable': 957272768, 'epoch': '2.338'}
 78%|███████████████████████████████████████████████████████████████████▊                   | 1365/1751 [22:48:17<6:21:46, 59.34s/it] 78%|███████████████████████████████████████████████████████████████████▊                   | 1366/1751 [22:49:17<6:21:30, 59.45s/it]                                                                                                                                     {'loss': '0.5169', 'grad_norm': '0.1797', 'learning_rate': '2.54e-06', 'ppl': '1.677', 'memory/max_active (GiB)': '74.21', 'memory/max_allocated (GiB)': '74.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '139.4', 'tokens/total': 2590566912, 'tokens/trainable': 957934912, 'epoch': '2.339'}
 78%|███████████████████████████████████████████████████████████████████▊                   | 1366/1751 [22:49:17<6:21:30, 59.45s/it] 78%|███████████████████████████████████████████████████████████████████▉                   | 1367/1751 [22:50:15<6:17:08, 58.93s/it]                                                                                                                                     {'loss': '0.5351', 'grad_norm': '0.1963', 'learning_rate': '2.527e-06', 'ppl': '1.708', 'memory/max_active (GiB)': '74.46', 'memory/max_allocated (GiB)': '74.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '114', 'tokens/total': 2592374016, 'tokens/trainable': 958569920, 'epoch': '2.341'}
 78%|███████████████████████████████████████████████████████████████████▉                   | 1367/1751 [22:50:15<6:17:08, 58.93s/it] 78%|███████████████████████████████████████████████████████████████████▉                   | 1368/1751 [22:51:15<6:18:35, 59.31s/it]                                                                                                                                     {'loss': '0.5092', 'grad_norm': '0.1777', 'learning_rate': '2.515e-06', 'ppl': '1.664', 'memory/max_active (GiB)': '76.11', 'memory/max_allocated (GiB)': '76.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.99', 'tokens/total': 2594274048, 'tokens/trainable': 959276096, 'epoch': '2.343'}
 78%|███████████████████████████████████████████████████████████████████▉                   | 1368/1751 [22:51:15<6:18:35, 59.31s/it] 78%|████████████████████████████████████████████████████████████████████                   | 1369/1751 [22:52:14<6:16:58, 59.21s/it]                                                                                                                                     {'loss': '0.5563', 'grad_norm': '0.1846', 'learning_rate': '2.502e-06', 'ppl': '1.744', 'memory/max_active (GiB)': '76.23', 'memory/max_allocated (GiB)': '76.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '122.2', 'tokens/total': 2596114432, 'tokens/trainable': 959966592, 'epoch': '2.344'}
 78%|████████████████████████████████████████████████████████████████████                   | 1369/1751 [22:52:14<6:16:58, 59.21s/it] 78%|████████████████████████████████████████████████████████████████████                   | 1370/1751 [22:53:15<6:19:20, 59.74s/it]                                                                                                                                     {'loss': '0.5095', 'grad_norm': '0.1738', 'learning_rate': '2.49e-06', 'ppl': '1.664', 'memory/max_active (GiB)': '74.67', 'memory/max_allocated (GiB)': '74.67', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '55.38', 'tokens/total': 2598072832, 'tokens/trainable': 960673472, 'epoch': '2.346'}
 78%|████████████████████████████████████████████████████████████████████                   | 1370/1751 [22:53:15<6:19:20, 59.74s/it] 78%|████████████████████████████████████████████████████████████████████                   | 1371/1751 [22:54:15<6:18:52, 59.82s/it]                                                                                                                                     {'loss': '0.5017', 'grad_norm': '0.1689', 'learning_rate': '2.477e-06', 'ppl': '1.652', 'memory/max_active (GiB)': '76.51', 'memory/max_allocated (GiB)': '76.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.21', 'tokens/total': 2600000768, 'tokens/trainable': 961433024, 'epoch': '2.348'}
 78%|████████████████████████████████████████████████████████████████████                   | 1371/1751 [22:54:15<6:18:52, 59.82s/it] 78%|████████████████████████████████████████████████████████████████████▏                  | 1372/1751 [22:55:15<6:17:31, 59.77s/it]                                                                                                                                     {'loss': '0.5037', 'grad_norm': '0.1875', 'learning_rate': '2.465e-06', 'ppl': '1.655', 'memory/max_active (GiB)': '75.96', 'memory/max_allocated (GiB)': '75.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '49', 'tokens/total': 2601883648, 'tokens/trainable': 962119104, 'epoch': '2.35'}
 78%|████████████████████████████████████████████████████████████████████▏                  | 1372/1751 [22:55:15<6:17:31, 59.77s/it] 78%|████████████████████████████████████████████████████████████████████▏                  | 1373/1751 [22:56:17<6:20:59, 60.47s/it]                                                                                                                                     {'loss': '0.5063', 'grad_norm': '0.1729', 'learning_rate': '2.453e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '75.06', 'memory/max_allocated (GiB)': '75.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.39', 'tokens/total': 2603858176, 'tokens/trainable': 962870528, 'epoch': '2.351'}
 78%|████████████████████████████████████████████████████████████████████▏                  | 1373/1751 [22:56:17<6:20:59, 60.47s/it] 78%|████████████████████████████████████████████████████████████████████▎                  | 1374/1751 [22:57:16<6:17:03, 60.01s/it]                                                                                                                                     {'loss': '0.5394', 'grad_norm': '0.1777', 'learning_rate': '2.44e-06', 'ppl': '1.715', 'memory/max_active (GiB)': '73.31', 'memory/max_allocated (GiB)': '73.31', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '93.84', 'tokens/total': 2605719552, 'tokens/trainable': 963575872, 'epoch': '2.353'}
 78%|████████████████████████████████████████████████████████████████████▎                  | 1374/1751 [22:57:16<6:17:03, 60.01s/it] 79%|████████████████████████████████████████████████████████████████████▎                  | 1375/1751 [22:58:15<6:15:17, 59.89s/it]                                                                                                                                     {'loss': '0.518', 'grad_norm': '0.1738', 'learning_rate': '2.428e-06', 'ppl': '1.679', 'memory/max_active (GiB)': '76.95', 'memory/max_allocated (GiB)': '76.95', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.14', 'tokens/total': 2607615232, 'tokens/trainable': 964274304, 'epoch': '2.355'}
 79%|████████████████████████████████████████████████████████████████████▎                  | 1375/1751 [22:58:15<6:15:17, 59.89s/it] 79%|████████████████████████████████████████████████████████████████████▎                  | 1376/1751 [22:59:17<6:16:49, 60.29s/it]                                                                                                                                     {'loss': '0.5607', 'grad_norm': '0.2021', 'learning_rate': '2.416e-06', 'ppl': '1.752', 'memory/max_active (GiB)': '72.37', 'memory/max_allocated (GiB)': '72.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.98', 'tokens/total': 2609553152, 'tokens/trainable': 964957120, 'epoch': '2.356'}
 79%|████████████████████████████████████████████████████████████████████▎                  | 1376/1751 [22:59:17<6:16:49, 60.29s/it] 79%|████████████████████████████████████████████████████████████████████▍                  | 1377/1751 [23:00:19<6:18:51, 60.78s/it]                                                                                                                                     {'loss': '0.5101', 'grad_norm': '0.1875', 'learning_rate': '2.403e-06', 'ppl': '1.666', 'memory/max_active (GiB)': '75.11', 'memory/max_allocated (GiB)': '75.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '85.01', 'tokens/total': 2611565056, 'tokens/trainable': 965686144, 'epoch': '2.358'}
 79%|████████████████████████████████████████████████████████████████████▍                  | 1377/1751 [23:00:19<6:18:51, 60.78s/it] 79%|████████████████████████████████████████████████████████████████████▍                  | 1378/1751 [23:01:17<6:13:27, 60.07s/it]                                                                                                                                     {'loss': '0.5153', 'grad_norm': '0.1836', 'learning_rate': '2.391e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '68.58', 'memory/max_allocated (GiB)': '68.58', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '63.77', 'tokens/total': 2613367040, 'tokens/trainable': 966381696, 'epoch': '2.36'}
 79%|████████████████████████████████████████████████████████████████████▍                  | 1378/1751 [23:01:17<6:13:27, 60.07s/it] 79%|████████████████████████████████████████████████████████████████████▌                  | 1379/1751 [23:02:17<6:12:16, 60.04s/it]                                                                                                                                     {'loss': '0.4852', 'grad_norm': '0.1768', 'learning_rate': '2.379e-06', 'ppl': '1.624', 'memory/max_active (GiB)': '69.92', 'memory/max_allocated (GiB)': '69.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '163.8', 'tokens/total': 2615286016, 'tokens/trainable': 967070656, 'epoch': '2.362'}
 79%|████████████████████████████████████████████████████████████████████▌                  | 1379/1751 [23:02:17<6:12:16, 60.04s/it] 79%|████████████████████████████████████████████████████████████████████▌                  | 1380/1751 [23:03:16<6:10:24, 59.90s/it]                                                                                                                                     {'loss': '0.5223', 'grad_norm': '0.1738', 'learning_rate': '2.367e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '73.81', 'memory/max_allocated (GiB)': '73.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.03', 'tokens/total': 2617167104, 'tokens/trainable': 967780608, 'epoch': '2.363'}
 79%|████████████████████████████████████████████████████████████████████▌                  | 1380/1751 [23:03:16<6:10:24, 59.90s/it] 79%|████████████████████████████████████████████████████████████████████▌                  | 1381/1751 [23:04:17<6:11:09, 60.19s/it]                                                                                                                                     {'loss': '0.4872', 'grad_norm': '0.1729', 'learning_rate': '2.354e-06', 'ppl': '1.628', 'memory/max_active (GiB)': '75', 'memory/max_allocated (GiB)': '75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '62.47', 'tokens/total': 2619084544, 'tokens/trainable': 968473600, 'epoch': '2.365'}
 79%|████████████████████████████████████████████████████████████████████▌                  | 1381/1751 [23:04:17<6:11:09, 60.19s/it] 79%|████████████████████████████████████████████████████████████████████▋                  | 1382/1751 [23:05:16<6:07:12, 59.71s/it]                                                                                                                                     {'loss': '0.5257', 'grad_norm': '0.1777', 'learning_rate': '2.342e-06', 'ppl': '1.692', 'memory/max_active (GiB)': '73.49', 'memory/max_allocated (GiB)': '73.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.1', 'tokens/total': 2620932608, 'tokens/trainable': 969140480, 'epoch': '2.367'}
 79%|████████████████████████████████████████████████████████████████████▋                  | 1382/1751 [23:05:16<6:07:12, 59.71s/it] 79%|████████████████████████████████████████████████████████████████████▋                  | 1383/1751 [23:06:16<6:06:16, 59.72s/it]                                                                                                                                     {'loss': '0.5259', 'grad_norm': '0.1855', 'learning_rate': '2.33e-06', 'ppl': '1.692', 'memory/max_active (GiB)': '75.61', 'memory/max_allocated (GiB)': '75.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '49.45', 'tokens/total': 2622786560, 'tokens/trainable': 969833536, 'epoch': '2.368'}
 79%|████████████████████████████████████████████████████████████████████▋                  | 1383/1751 [23:06:16<6:06:16, 59.72s/it] 79%|████████████████████████████████████████████████████████████████████▊                  | 1384/1751 [23:07:15<6:04:11, 59.54s/it]                                                                                                                                     {'loss': '0.523', 'grad_norm': '0.1836', 'learning_rate': '2.318e-06', 'ppl': '1.687', 'memory/max_active (GiB)': '68.21', 'memory/max_allocated (GiB)': '68.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '110.6', 'tokens/total': 2624659712, 'tokens/trainable': 970493504, 'epoch': '2.37'}
 79%|████████████████████████████████████████████████████████████████████▊                  | 1384/1751 [23:07:15<6:04:11, 59.54s/it] 79%|████████████████████████████████████████████████████████████████████▊                  | 1385/1751 [23:08:17<6:08:30, 60.41s/it]                                                                                                                                     {'loss': '0.5006', 'grad_norm': '0.166', 'learning_rate': '2.306e-06', 'ppl': '1.65', 'memory/max_active (GiB)': '75.14', 'memory/max_allocated (GiB)': '75.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.2', 'tokens/total': 2626638848, 'tokens/trainable': 971235008, 'epoch': '2.372'}
 79%|████████████████████████████████████████████████████████████████████▊                  | 1385/1751 [23:08:17<6:08:30, 60.41s/it] 79%|████████████████████████████████████████████████████████████████████▊                  | 1386/1751 [23:09:17<6:05:52, 60.14s/it]                                                                                                                                     {'loss': '0.5218', 'grad_norm': '0.1846', 'learning_rate': '2.294e-06', 'ppl': '1.685', 'memory/max_active (GiB)': '77.26', 'memory/max_allocated (GiB)': '77.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.11', 'tokens/total': 2628491008, 'tokens/trainable': 971929280, 'epoch': '2.374'}
 79%|████████████████████████████████████████████████████████████████████▊                  | 1386/1751 [23:09:17<6:05:52, 60.14s/it] 79%|████████████████████████████████████████████████████████████████████▉                  | 1387/1751 [23:10:16<6:04:07, 60.02s/it]                                                                                                                                     {'loss': '0.5426', 'grad_norm': '0.1865', 'learning_rate': '2.282e-06', 'ppl': '1.72', 'memory/max_active (GiB)': '76.73', 'memory/max_allocated (GiB)': '76.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '50.35', 'tokens/total': 2630339840, 'tokens/trainable': 972605376, 'epoch': '2.375'}
 79%|████████████████████████████████████████████████████████████████████▉                  | 1387/1751 [23:10:16<6:04:07, 60.02s/it] 79%|████████████████████████████████████████████████████████████████████▉                  | 1388/1751 [23:11:16<6:01:55, 59.82s/it]                                                                                                                                     {'loss': '0.5388', 'grad_norm': '0.1846', 'learning_rate': '2.27e-06', 'ppl': '1.714', 'memory/max_active (GiB)': '75.58', 'memory/max_allocated (GiB)': '75.58', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.5', 'tokens/total': 2632201728, 'tokens/trainable': 973285952, 'epoch': '2.377'}
 79%|████████████████████████████████████████████████████████████████████▉                  | 1388/1751 [23:11:16<6:01:55, 59.82s/it] 79%|█████████████████████████████████████████████████████████████████████                  | 1389/1751 [23:12:16<6:00:50, 59.81s/it]                                                                                                                                     {'loss': '0.5152', 'grad_norm': '0.1826', 'learning_rate': '2.258e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '68.13', 'memory/max_allocated (GiB)': '68.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '150.8', 'tokens/total': 2634072320, 'tokens/trainable': 973975104, 'epoch': '2.379'}
 79%|█████████████████████████████████████████████████████████████████████                  | 1389/1751 [23:12:16<6:00:50, 59.81s/it] 79%|█████████████████████████████████████████████████████████████████████                  | 1390/1751 [23:13:17<6:02:47, 60.30s/it]                                                                                                                                     {'loss': '0.5461', 'grad_norm': '0.1846', 'learning_rate': '2.246e-06', 'ppl': '1.726', 'memory/max_active (GiB)': '76.49', 'memory/max_allocated (GiB)': '76.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.54', 'tokens/total': 2636014848, 'tokens/trainable': 974732928, 'epoch': '2.38'}
 79%|█████████████████████████████████████████████████████████████████████                  | 1390/1751 [23:13:17<6:02:47, 60.30s/it] 79%|█████████████████████████████████████████████████████████████████████                  | 1391/1751 [23:14:19<6:05:34, 60.93s/it]                                                                                                                                     {'loss': '0.4859', 'grad_norm': '0.1621', 'learning_rate': '2.234e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '75.4', 'memory/max_allocated (GiB)': '75.4', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '154.9', 'tokens/total': 2637978368, 'tokens/trainable': 975498432, 'epoch': '2.382'}
 79%|█████████████████████████████████████████████████████████████████████                  | 1391/1751 [23:14:19<6:05:34, 60.93s/it] 79%|█████████████████████████████████████████████████████████████████████▏                 | 1392/1751 [23:15:19<6:02:09, 60.53s/it]                                                                                                                                     {'loss': '0.5134', 'grad_norm': '0.1846', 'learning_rate': '2.222e-06', 'ppl': '1.671', 'memory/max_active (GiB)': '76.14', 'memory/max_allocated (GiB)': '76.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.86', 'tokens/total': 2639889920, 'tokens/trainable': 976191872, 'epoch': '2.384'}
 79%|█████████████████████████████████████████████████████████████████████▏                 | 1392/1751 [23:15:19<6:02:09, 60.53s/it] 80%|█████████████████████████████████████████████████████████████████████▏                 | 1393/1751 [23:16:20<6:01:37, 60.61s/it]                                                                                                                                     {'loss': '0.5201', 'grad_norm': '0.1748', 'learning_rate': '2.21e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '72.24', 'memory/max_allocated (GiB)': '72.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '57.32', 'tokens/total': 2641784576, 'tokens/trainable': 976900864, 'epoch': '2.386'}
 80%|█████████████████████████████████████████████████████████████████████▏                 | 1393/1751 [23:16:20<6:01:37, 60.61s/it] 80%|█████████████████████████████████████████████████████████████████████▎                 | 1394/1751 [23:17:20<5:59:28, 60.42s/it]                                                                                                                                     {'loss': '0.5036', 'grad_norm': '0.1777', 'learning_rate': '2.199e-06', 'ppl': '1.655', 'memory/max_active (GiB)': '76.08', 'memory/max_allocated (GiB)': '76.08', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '144.6', 'tokens/total': 2643645696, 'tokens/trainable': 977562176, 'epoch': '2.387'}
 80%|█████████████████████████████████████████████████████████████████████▎                 | 1394/1751 [23:17:20<5:59:28, 60.42s/it] 80%|█████████████████████████████████████████████████████████████████████▎                 | 1395/1751 [23:18:17<5:53:15, 59.54s/it]                                                                                                                                     {'loss': '0.5662', 'grad_norm': '0.1973', 'learning_rate': '2.187e-06', 'ppl': '1.762', 'memory/max_active (GiB)': '73.89', 'memory/max_allocated (GiB)': '73.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '125.5', 'tokens/total': 2645380608, 'tokens/trainable': 978224896, 'epoch': '2.389'}
 80%|█████████████████████████████████████████████████████████████████████▎                 | 1395/1751 [23:18:17<5:53:15, 59.54s/it] 80%|█████████████████████████████████████████████████████████████████████▎                 | 1396/1751 [23:19:18<5:54:36, 59.93s/it]                                                                                                                                     {'loss': '0.5069', 'grad_norm': '0.1709', 'learning_rate': '2.175e-06', 'ppl': '1.66', 'memory/max_active (GiB)': '75.3', 'memory/max_allocated (GiB)': '75.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.84', 'tokens/total': 2647321600, 'tokens/trainable': 978949568, 'epoch': '2.391'}
 80%|█████████████████████████████████████████████████████████████████████▎                 | 1396/1751 [23:19:18<5:54:36, 59.93s/it] 80%|█████████████████████████████████████████████████████████████████████▍                 | 1397/1751 [23:20:20<5:56:55, 60.50s/it]                                                                                                                                     {'loss': '0.5159', 'grad_norm': '0.1758', 'learning_rate': '2.163e-06', 'ppl': '1.675', 'memory/max_active (GiB)': '73.69', 'memory/max_allocated (GiB)': '73.69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.98', 'tokens/total': 2649321728, 'tokens/trainable': 979654080, 'epoch': '2.392'}
 80%|█████████████████████████████████████████████████████████████████████▍                 | 1397/1751 [23:20:20<5:56:55, 60.50s/it] 80%|█████████████████████████████████████████████████████████████████████▍                 | 1398/1751 [23:21:19<5:53:45, 60.13s/it]                                                                                                                                     {'loss': '0.5081', 'grad_norm': '0.1719', 'learning_rate': '2.152e-06', 'ppl': '1.662', 'memory/max_active (GiB)': '75.83', 'memory/max_allocated (GiB)': '75.83', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.64', 'tokens/total': 2651199488, 'tokens/trainable': 980342848, 'epoch': '2.394'}
 80%|█████████████████████████████████████████████████████████████████████▍                 | 1398/1751 [23:21:19<5:53:45, 60.13s/it] 80%|█████████████████████████████████████████████████████████████████████▌                 | 1399/1751 [23:22:18<5:50:03, 59.67s/it]                                                                                                                                     {'loss': '0.563', 'grad_norm': '0.1836', 'learning_rate': '2.14e-06', 'ppl': '1.756', 'memory/max_active (GiB)': '74.16', 'memory/max_allocated (GiB)': '74.16', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '76.11', 'tokens/total': 2653012480, 'tokens/trainable': 981051072, 'epoch': '2.396'}
 80%|█████████████████████████████████████████████████████████████████████▌                 | 1399/1751 [23:22:18<5:50:03, 59.67s/it] 80%|█████████████████████████████████████████████████████████████████████▌                 | 1400/1751 [23:23:19<5:52:15, 60.21s/it]                                                                                                                                     {'loss': '0.5144', 'grad_norm': '0.1836', 'learning_rate': '2.128e-06', 'ppl': '1.673', 'memory/max_active (GiB)': '73.19', 'memory/max_allocated (GiB)': '73.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '157', 'tokens/total': 2654984192, 'tokens/trainable': 981757120, 'epoch': '2.398'}
 80%|█████████████████████████████████████████████████████████████████████▌                 | 1400/1751 [23:23:19<5:52:15, 60.21s/it] 80%|█████████████████████████████████████████████████████████████████████▌                 | 1401/1751 [23:24:18<5:49:10, 59.86s/it]                                                                                                                                     {'loss': '0.5143', 'grad_norm': '0.1895', 'learning_rate': '2.117e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '75.81', 'memory/max_allocated (GiB)': '75.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '54.42', 'tokens/total': 2656834304, 'tokens/trainable': 982435520, 'epoch': '2.399'}
 80%|█████████████████████████████████████████████████████████████████████▌                 | 1401/1751 [23:24:18<5:49:10, 59.86s/it] 80%|█████████████████████████████████████████████████████████████████████▋                 | 1402/1751 [23:25:20<5:51:31, 60.43s/it]                                                                                                                                     {'loss': '0.5324', 'grad_norm': '0.1787', 'learning_rate': '2.105e-06', 'ppl': '1.703', 'memory/max_active (GiB)': '75.39', 'memory/max_allocated (GiB)': '75.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '87.29', 'tokens/total': 2658768384, 'tokens/trainable': 983172928, 'epoch': '2.401'}
 80%|█████████████████████████████████████████████████████████████████████▋                 | 1402/1751 [23:25:20<5:51:31, 60.43s/it] 80%|█████████████████████████████████████████████████████████████████████▋                 | 1403/1751 [23:26:22<5:52:58, 60.86s/it]                                                                                                                                     {'loss': '0.5267', 'grad_norm': '0.1807', 'learning_rate': '2.093e-06', 'ppl': '1.693', 'memory/max_active (GiB)': '71.1', 'memory/max_allocated (GiB)': '71.1', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.77', 'tokens/total': 2660723456, 'tokens/trainable': 983910464, 'epoch': '2.403'}
 80%|█████████████████████████████████████████████████████████████████████▋                 | 1403/1751 [23:26:22<5:52:58, 60.86s/it] 80%|█████████████████████████████████████████████████████████████████████▊                 | 1404/1751 [23:27:23<5:52:58, 61.03s/it]                                                                                                                                     {'loss': '0.5143', 'grad_norm': '0.1797', 'learning_rate': '2.082e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '73.89', 'memory/max_allocated (GiB)': '73.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '128.3', 'tokens/total': 2662648320, 'tokens/trainable': 984627392, 'epoch': '2.404'}
 80%|█████████████████████████████████████████████████████████████████████▊                 | 1404/1751 [23:27:23<5:52:58, 61.03s/it] 80%|█████████████████████████████████████████████████████████████████████▊                 | 1405/1751 [23:28:24<5:50:54, 60.85s/it]                                                                                                                                     {'loss': '0.5224', 'grad_norm': '0.1816', 'learning_rate': '2.07e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '77.04', 'memory/max_allocated (GiB)': '77.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '245.3', 'tokens/total': 2664518400, 'tokens/trainable': 985310976, 'epoch': '2.406'}
 80%|█████████████████████████████████████████████████████████████████████▊                 | 1405/1751 [23:28:24<5:50:54, 60.85s/it] 80%|█████████████████████████████████████████████████████████████████████▊                 | 1406/1751 [23:29:26<5:52:08, 61.24s/it]                                                                                                                                     {'loss': '0.505', 'grad_norm': '0.1826', 'learning_rate': '2.059e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '70.84', 'memory/max_allocated (GiB)': '70.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.49', 'tokens/total': 2666457344, 'tokens/trainable': 986032064, 'epoch': '2.408'}
 80%|█████████████████████████████████████████████████████████████████████▊                 | 1406/1751 [23:29:26<5:52:08, 61.24s/it] 80%|█████████████████████████████████████████████████████████████████████▉                 | 1407/1751 [23:30:25<5:47:06, 60.54s/it]                                                                                                                                     {'loss': '0.5096', 'grad_norm': '0.1816', 'learning_rate': '2.047e-06', 'ppl': '1.665', 'memory/max_active (GiB)': '75.34', 'memory/max_allocated (GiB)': '75.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '63.44', 'tokens/total': 2668345344, 'tokens/trainable': 986723584, 'epoch': '2.41'}
 80%|█████████████████████████████████████████████████████████████████████▉                 | 1407/1751 [23:30:25<5:47:06, 60.54s/it] 80%|█████████████████████████████████████████████████████████████████████▉                 | 1408/1751 [23:31:26<5:47:43, 60.83s/it]                                                                                                                                     {'loss': '0.4974', 'grad_norm': '0.1738', 'learning_rate': '2.036e-06', 'ppl': '1.644', 'memory/max_active (GiB)': '74.47', 'memory/max_allocated (GiB)': '74.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '164', 'tokens/total': 2670328064, 'tokens/trainable': 987444992, 'epoch': '2.411'}
 80%|█████████████████████████████████████████████████████████████████████▉                 | 1408/1751 [23:31:26<5:47:43, 60.83s/it] 80%|██████████████████████████████████████████████████████████████████████                 | 1409/1751 [23:32:27<5:46:35, 60.81s/it]                                                                                                                                     {'loss': '0.5151', 'grad_norm': '0.1846', 'learning_rate': '2.025e-06', 'ppl': '1.674', 'memory/max_active (GiB)': '75.35', 'memory/max_allocated (GiB)': '75.35', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '138.3', 'tokens/total': 2672250368, 'tokens/trainable': 988152704, 'epoch': '2.413'}
 80%|██████████████████████████████████████████████████████████████████████                 | 1409/1751 [23:32:27<5:46:35, 60.81s/it] 81%|██████████████████████████████████████████████████████████████████████                 | 1410/1751 [23:33:24<5:38:00, 59.47s/it]                                                                                                                                     {'loss': '0.5398', 'grad_norm': '0.1875', 'learning_rate': '2.013e-06', 'ppl': '1.716', 'memory/max_active (GiB)': '71.72', 'memory/max_allocated (GiB)': '71.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '42.16', 'tokens/total': 2674078720, 'tokens/trainable': 988829888, 'epoch': '2.415'}
 81%|██████████████████████████████████████████████████████████████████████                 | 1410/1751 [23:33:24<5:38:00, 59.47s/it] 81%|██████████████████████████████████████████████████████████████████████                 | 1411/1751 [23:34:24<5:39:11, 59.86s/it]                                                                                                                                     {'loss': '0.5293', 'grad_norm': '0.1797', 'learning_rate': '2.002e-06', 'ppl': '1.698', 'memory/max_active (GiB)': '76.86', 'memory/max_allocated (GiB)': '76.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.31', 'tokens/total': 2675995392, 'tokens/trainable': 989542656, 'epoch': '2.416'}
 81%|██████████████████████████████████████████████████████████████████████                 | 1411/1751 [23:34:24<5:39:11, 59.86s/it] 81%|██████████████████████████████████████████████████████████████████████▏                | 1412/1751 [23:35:24<5:38:18, 59.88s/it]                                                                                                                                     {'loss': '0.5198', 'grad_norm': '0.1865', 'learning_rate': '1.99e-06', 'ppl': '1.682', 'memory/max_active (GiB)': '73.25', 'memory/max_allocated (GiB)': '73.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.67', 'tokens/total': 2677877248, 'tokens/trainable': 990217600, 'epoch': '2.418'}
 81%|██████████████████████████████████████████████████████████████████████▏                | 1412/1751 [23:35:24<5:38:18, 59.88s/it] 81%|██████████████████████████████████████████████████████████████████████▏                | 1413/1751 [23:36:28<5:44:09, 61.09s/it]                                                                                                                                     {'loss': '0.4885', 'grad_norm': '0.168', 'learning_rate': '1.979e-06', 'ppl': '1.63', 'memory/max_active (GiB)': '73.74', 'memory/max_allocated (GiB)': '73.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.56', 'tokens/total': 2679955712, 'tokens/trainable': 990999296, 'epoch': '2.42'}
 81%|██████████████████████████████████████████████████████████████████████▏                | 1413/1751 [23:36:28<5:44:09, 61.09s/it] 81%|██████████████████████████████████████████████████████████████████████▎                | 1414/1751 [23:37:27<5:39:59, 60.53s/it]                                                                                                                                     {'loss': '0.5121', 'grad_norm': '0.1836', 'learning_rate': '1.968e-06', 'ppl': '1.669', 'memory/max_active (GiB)': '73.98', 'memory/max_allocated (GiB)': '73.98', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '76.43', 'tokens/total': 2681843200, 'tokens/trainable': 991672704, 'epoch': '2.422'}
 81%|██████████████████████████████████████████████████████████████████████▎                | 1414/1751 [23:37:27<5:39:59, 60.53s/it] 81%|██████████████████████████████████████████████████████████████████████▎                | 1415/1751 [23:38:25<5:34:38, 59.76s/it]                                                                                                                                     {'loss': '0.5439', 'grad_norm': '0.1943', 'learning_rate': '1.957e-06', 'ppl': '1.723', 'memory/max_active (GiB)': '76.94', 'memory/max_allocated (GiB)': '76.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '152.6', 'tokens/total': 2683646720, 'tokens/trainable': 992375360, 'epoch': '2.423'}
 81%|██████████████████████████████████████████████████████████████████████▎                | 1415/1751 [23:38:25<5:34:38, 59.76s/it] 81%|██████████████████████████████████████████████████████████████████████▎                | 1416/1751 [23:39:25<5:33:03, 59.65s/it]                                                                                                                                     {'loss': '0.5254', 'grad_norm': '0.1875', 'learning_rate': '1.945e-06', 'ppl': '1.691', 'memory/max_active (GiB)': '76.75', 'memory/max_allocated (GiB)': '76.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '94.31', 'tokens/total': 2685504000, 'tokens/trainable': 993048384, 'epoch': '2.425'}
 81%|██████████████████████████████████████████████████████████████████████▎                | 1416/1751 [23:39:25<5:33:03, 59.65s/it] 81%|██████████████████████████████████████████████████████████████████████▍                | 1417/1751 [23:40:21<5:26:48, 58.71s/it]                                                                                                                                     {'loss': '0.552', 'grad_norm': '0.1963', 'learning_rate': '1.934e-06', 'ppl': '1.737', 'memory/max_active (GiB)': '69.43', 'memory/max_allocated (GiB)': '69.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '113.1', 'tokens/total': 2687267840, 'tokens/trainable': 993664832, 'epoch': '2.427'}
 81%|██████████████████████████████████████████████████████████████████████▍                | 1417/1751 [23:40:21<5:26:48, 58.71s/it] 81%|██████████████████████████████████████████████████████████████████████▍                | 1418/1751 [23:41:20<5:25:23, 58.63s/it]                                                                                                                                     {'loss': '0.5335', 'grad_norm': '0.1855', 'learning_rate': '1.923e-06', 'ppl': '1.705', 'memory/max_active (GiB)': '69.93', 'memory/max_allocated (GiB)': '69.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '116', 'tokens/total': 2689062400, 'tokens/trainable': 994317312, 'epoch': '2.428'}
 81%|██████████████████████████████████████████████████████████████████████▍                | 1418/1751 [23:41:20<5:25:23, 58.63s/it] 81%|██████████████████████████████████████████████████████████████████████▌                | 1419/1751 [23:42:21<5:29:18, 59.51s/it]                                                                                                                                     {'loss': '0.5139', 'grad_norm': '0.1826', 'learning_rate': '1.912e-06', 'ppl': '1.672', 'memory/max_active (GiB)': '74.77', 'memory/max_allocated (GiB)': '74.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.96', 'tokens/total': 2691066624, 'tokens/trainable': 995042176, 'epoch': '2.43'}
 81%|██████████████████████████████████████████████████████████████████████▌                | 1419/1751 [23:42:21<5:29:18, 59.51s/it] 81%|██████████████████████████████████████████████████████████████████████▌                | 1420/1751 [23:43:25<5:35:38, 60.84s/it]                                                                                                                                     {'loss': '0.4678', 'grad_norm': '0.168', 'learning_rate': '1.901e-06', 'ppl': '1.597', 'memory/max_active (GiB)': '77.25', 'memory/max_allocated (GiB)': '77.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '72.85', 'tokens/total': 2693125376, 'tokens/trainable': 995778816, 'epoch': '2.432'}
 81%|██████████████████████████████████████████████████████████████████████▌                | 1420/1751 [23:43:25<5:35:38, 60.84s/it] 81%|██████████████████████████████████████████████████████████████████████▌                | 1421/1751 [23:44:26<5:35:21, 60.97s/it]                                                                                                                                     {'loss': '0.4836', 'grad_norm': '0.1729', 'learning_rate': '1.89e-06', 'ppl': '1.622', 'memory/max_active (GiB)': '71.66', 'memory/max_allocated (GiB)': '71.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '192.1', 'tokens/total': 2695092736, 'tokens/trainable': 996506880, 'epoch': '2.434'}
 81%|██████████████████████████████████████████████████████████████████████▌                | 1421/1751 [23:44:26<5:35:21, 60.97s/it] 81%|██████████████████████████████████████████████████████████████████████▋                | 1422/1751 [23:45:28<5:35:20, 61.16s/it]                                                                                                                                     {'loss': '0.5385', 'grad_norm': '0.1777', 'learning_rate': '1.879e-06', 'ppl': '1.713', 'memory/max_active (GiB)': '75.12', 'memory/max_allocated (GiB)': '75.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '63.3', 'tokens/total': 2697052928, 'tokens/trainable': 997251264, 'epoch': '2.435'}
 81%|██████████████████████████████████████████████████████████████████████▋                | 1422/1751 [23:45:28<5:35:20, 61.16s/it] 81%|██████████████████████████████████████████████████████████████████████▋                | 1423/1751 [23:46:27<5:30:15, 60.41s/it]                                                                                                                                     {'loss': '0.5118', 'grad_norm': '0.1807', 'learning_rate': '1.868e-06', 'ppl': '1.668', 'memory/max_active (GiB)': '74.92', 'memory/max_allocated (GiB)': '74.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '51.78', 'tokens/total': 2698894848, 'tokens/trainable': 997915072, 'epoch': '2.437'}
 81%|██████████████████████████████████████████████████████████████████████▋                | 1423/1751 [23:46:27<5:30:15, 60.41s/it] 81%|██████████████████████████████████████████████████████████████████████▊                | 1424/1751 [23:47:27<5:28:18, 60.24s/it]                                                                                                                                     {'loss': '0.4767', 'grad_norm': '0.1699', 'learning_rate': '1.857e-06', 'ppl': '1.611', 'memory/max_active (GiB)': '74.94', 'memory/max_allocated (GiB)': '74.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '102.2', 'tokens/total': 2700785664, 'tokens/trainable': 998587136, 'epoch': '2.439'}
 81%|██████████████████████████████████████████████████████████████████████▊                | 1424/1751 [23:47:27<5:28:18, 60.24s/it] 81%|██████████████████████████████████████████████████████████████████████▊                | 1425/1751 [23:48:26<5:25:16, 59.87s/it]                                                                                                                                     {'loss': '0.5296', 'grad_norm': '0.1875', 'learning_rate': '1.846e-06', 'ppl': '1.698', 'memory/max_active (GiB)': '75.99', 'memory/max_allocated (GiB)': '75.99', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '194.1', 'tokens/total': 2702717184, 'tokens/trainable': 999321856, 'epoch': '2.44'}
 81%|██████████████████████████████████████████████████████████████████████▊                | 1425/1751 [23:48:26<5:25:16, 59.87s/it] 81%|██████████████████████████████████████████████████████████████████████▊                | 1426/1751 [23:49:25<5:23:33, 59.73s/it]                                                                                                                                     {'loss': '0.5335', 'grad_norm': '0.1904', 'learning_rate': '1.835e-06', 'ppl': '1.705', 'memory/max_active (GiB)': '75.66', 'memory/max_allocated (GiB)': '75.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.32', 'tokens/total': 2704666112, 'tokens/trainable': 1000017216, 'epoch': '2.442'}
 81%|██████████████████████████████████████████████████████████████████████▊                | 1426/1751 [23:49:25<5:23:33, 59.73s/it] 81%|██████████████████████████████████████████████████████████████████████▉                | 1427/1751 [23:50:28<5:27:04, 60.57s/it]                                                                                                                                     {'loss': '0.5048', 'grad_norm': '0.165', 'learning_rate': '1.824e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '75.31', 'memory/max_allocated (GiB)': '75.31', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '79.65', 'tokens/total': 2706691584, 'tokens/trainable': 1000778944, 'epoch': '2.444'}
 81%|██████████████████████████████████████████████████████████████████████▉                | 1427/1751 [23:50:28<5:27:04, 60.57s/it] 82%|██████████████████████████████████████████████████████████████████████▉                | 1428/1751 [23:51:28<5:25:47, 60.52s/it]                                                                                                                                     {'loss': '0.5034', 'grad_norm': '0.1631', 'learning_rate': '1.813e-06', 'ppl': '1.654', 'memory/max_active (GiB)': '75.8', 'memory/max_allocated (GiB)': '75.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '170.3', 'tokens/total': 2708635392, 'tokens/trainable': 1001510720, 'epoch': '2.446'}
 82%|██████████████████████████████████████████████████████████████████████▉                | 1428/1751 [23:51:28<5:25:47, 60.52s/it] 82%|███████████████████████████████████████████████████████████████████████                | 1429/1751 [23:52:26<5:20:54, 59.80s/it]                                                                                                                                     {'loss': '0.5561', 'grad_norm': '0.1924', 'learning_rate': '1.802e-06', 'ppl': '1.744', 'memory/max_active (GiB)': '72.92', 'memory/max_allocated (GiB)': '72.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '85.07', 'tokens/total': 2710474496, 'tokens/trainable': 1002182720, 'epoch': '2.447'}
 82%|███████████████████████████████████████████████████████████████████████                | 1429/1751 [23:52:26<5:20:54, 59.80s/it] 82%|███████████████████████████████████████████████████████████████████████                | 1430/1751 [23:53:23<5:16:06, 59.09s/it]                                                                                                                                     {'loss': '0.5344', 'grad_norm': '0.1904', 'learning_rate': '1.792e-06', 'ppl': '1.706', 'memory/max_active (GiB)': '70.84', 'memory/max_allocated (GiB)': '70.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '96.34', 'tokens/total': 2712323584, 'tokens/trainable': 1002877952, 'epoch': '2.449'}
 82%|███████████████████████████████████████████████████████████████████████                | 1430/1751 [23:53:23<5:16:06, 59.09s/it] 82%|███████████████████████████████████████████████████████████████████████                | 1431/1751 [23:54:21<5:12:07, 58.52s/it]                                                                                                                                     {'loss': '0.5582', 'grad_norm': '0.1875', 'learning_rate': '1.781e-06', 'ppl': '1.748', 'memory/max_active (GiB)': '69.89', 'memory/max_allocated (GiB)': '69.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '106', 'tokens/total': 2714126336, 'tokens/trainable': 1003561280, 'epoch': '2.451'}
 82%|███████████████████████████████████████████████████████████████████████                | 1431/1751 [23:54:21<5:12:07, 58.52s/it] 82%|███████████████████████████████████████████████████████████████████████▏               | 1432/1751 [23:55:22<5:14:52, 59.22s/it]                                                                                                                                     {'loss': '0.4954', 'grad_norm': '0.166', 'learning_rate': '1.77e-06', 'ppl': '1.641', 'memory/max_active (GiB)': '72.98', 'memory/max_allocated (GiB)': '72.98', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '113.1', 'tokens/total': 2716091648, 'tokens/trainable': 1004314816, 'epoch': '2.452'}
 82%|███████████████████████████████████████████████████████████████████████▏               | 1432/1751 [23:55:22<5:14:52, 59.22s/it] 82%|███████████████████████████████████████████████████████████████████████▏               | 1433/1751 [23:56:21<5:13:35, 59.17s/it]                                                                                                                                     {'loss': '0.5364', 'grad_norm': '0.1875', 'learning_rate': '1.759e-06', 'ppl': '1.71', 'memory/max_active (GiB)': '75.07', 'memory/max_allocated (GiB)': '75.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '173', 'tokens/total': 2717935872, 'tokens/trainable': 1004979840, 'epoch': '2.454'}
 82%|███████████████████████████████████████████████████████████████████████▏               | 1433/1751 [23:56:21<5:13:35, 59.17s/it] 82%|███████████████████████████████████████████████████████████████████████▏               | 1434/1751 [23:57:18<5:10:40, 58.80s/it]                                                                                                                                     {'loss': '0.5375', 'grad_norm': '0.1865', 'learning_rate': '1.749e-06', 'ppl': '1.712', 'memory/max_active (GiB)': '74.87', 'memory/max_allocated (GiB)': '74.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '91.13', 'tokens/total': 2719731200, 'tokens/trainable': 1005642496, 'epoch': '2.456'}
 82%|███████████████████████████████████████████████████████████████████████▏               | 1434/1751 [23:57:19<5:10:40, 58.80s/it] 82%|███████████████████████████████████████████████████████████████████████▎               | 1435/1751 [23:58:17<5:08:39, 58.61s/it]                                                                                                                                     {'loss': '0.5065', 'grad_norm': '0.1816', 'learning_rate': '1.738e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '68.17', 'memory/max_allocated (GiB)': '68.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.91', 'tokens/total': 2721521408, 'tokens/trainable': 1006324288, 'epoch': '2.458'}
 82%|███████████████████████████████████████████████████████████████████████▎               | 1435/1751 [23:58:17<5:08:39, 58.61s/it] 82%|███████████████████████████████████████████████████████████████████████▎               | 1436/1751 [23:59:17<5:09:57, 59.04s/it]                                                                                                                                     {'loss': '0.527', 'grad_norm': '0.1797', 'learning_rate': '1.727e-06', 'ppl': '1.694', 'memory/max_active (GiB)': '76.49', 'memory/max_allocated (GiB)': '76.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '100.9', 'tokens/total': 2723430912, 'tokens/trainable': 1007030912, 'epoch': '2.459'}
 82%|███████████████████████████████████████████████████████████████████████▎               | 1436/1751 [23:59:17<5:09:57, 59.04s/it] 82%|███████████████████████████████████████████████████████████████████████▍               | 1437/1751 [24:00:13<5:04:43, 58.23s/it]                                                                                                                                     {'loss': '0.5272', 'grad_norm': '0.1904', 'learning_rate': '1.717e-06', 'ppl': '1.694', 'memory/max_active (GiB)': '74.93', 'memory/max_allocated (GiB)': '74.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '74.82', 'tokens/total': 2725177600, 'tokens/trainable': 1007687168, 'epoch': '2.461'}
 82%|███████████████████████████████████████████████████████████████████████▍               | 1437/1751 [24:00:13<5:04:43, 58.23s/it] 82%|███████████████████████████████████████████████████████████████████████▍               | 1438/1751 [24:01:11<5:03:28, 58.17s/it]                                                                                                                                     {'loss': '0.5382', 'grad_norm': '0.1768', 'learning_rate': '1.706e-06', 'ppl': '1.713', 'memory/max_active (GiB)': '73.09', 'memory/max_allocated (GiB)': '73.09', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '97.24', 'tokens/total': 2727022592, 'tokens/trainable': 1008372800, 'epoch': '2.463'}
 82%|███████████████████████████████████████████████████████████████████████▍               | 1438/1751 [24:01:11<5:03:28, 58.17s/it] 82%|███████████████████████████████████████████████████████████████████████▍               | 1439/1751 [24:02:07<4:58:31, 57.41s/it]                                                                                                                                     {'loss': '0.5501', 'grad_norm': '0.1914', 'learning_rate': '1.696e-06', 'ppl': '1.733', 'memory/max_active (GiB)': '74.49', 'memory/max_allocated (GiB)': '74.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.73', 'tokens/total': 2728731392, 'tokens/trainable': 1009004992, 'epoch': '2.464'}
 82%|███████████████████████████████████████████████████████████████████████▍               | 1439/1751 [24:02:07<4:58:31, 57.41s/it] 82%|███████████████████████████████████████████████████████████████████████▌               | 1440/1751 [24:03:04<4:57:15, 57.35s/it]                                                                                                                                     {'loss': '0.5727', 'grad_norm': '0.1885', 'learning_rate': '1.685e-06', 'ppl': '1.773', 'memory/max_active (GiB)': '73.56', 'memory/max_allocated (GiB)': '73.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '14.04', 'tokens/total': 2730552576, 'tokens/trainable': 1009684672, 'epoch': '2.466'}
 82%|███████████████████████████████████████████████████████████████████████▌               | 1440/1751 [24:03:04<4:57:15, 57.35s/it] 82%|███████████████████████████████████████████████████████████████████████▌               | 1441/1751 [24:04:04<4:59:55, 58.05s/it]                                                                                                                                     {'loss': '0.5131', 'grad_norm': '0.1768', 'learning_rate': '1.675e-06', 'ppl': '1.671', 'memory/max_active (GiB)': '74.82', 'memory/max_allocated (GiB)': '74.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '78.74', 'tokens/total': 2732455168, 'tokens/trainable': 1010368896, 'epoch': '2.468'}
 82%|███████████████████████████████████████████████████████████████████████▌               | 1441/1751 [24:04:04<4:59:55, 58.05s/it] 82%|███████████████████████████████████████████████████████████████████████▋               | 1442/1751 [24:05:03<5:01:37, 58.57s/it]                                                                                                                                     {'loss': '0.575', 'grad_norm': '0.1943', 'learning_rate': '1.664e-06', 'ppl': '1.777', 'memory/max_active (GiB)': '75.03', 'memory/max_allocated (GiB)': '75.03', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.99', 'tokens/total': 2734364416, 'tokens/trainable': 1011037760, 'epoch': '2.47'}
 82%|███████████████████████████████████████████████████████████████████████▋               | 1442/1751 [24:05:03<5:01:37, 58.57s/it] 82%|███████████████████████████████████████████████████████████████████████▋               | 1443/1751 [24:06:04<5:03:53, 59.20s/it]                                                                                                                                     {'loss': '0.5289', 'grad_norm': '0.1777', 'learning_rate': '1.654e-06', 'ppl': '1.697', 'memory/max_active (GiB)': '75.21', 'memory/max_allocated (GiB)': '75.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '89.2', 'tokens/total': 2736276224, 'tokens/trainable': 1011748352, 'epoch': '2.471'}
 82%|███████████████████████████████████████████████████████████████████████▋               | 1443/1751 [24:06:04<5:03:53, 59.20s/it] 82%|███████████████████████████████████████████████████████████████████████▋               | 1444/1751 [24:07:05<5:04:50, 59.58s/it]                                                                                                                                     {'loss': '0.4937', 'grad_norm': '0.1777', 'learning_rate': '1.644e-06', 'ppl': '1.638', 'memory/max_active (GiB)': '76.39', 'memory/max_allocated (GiB)': '76.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '97.31', 'tokens/total': 2738196480, 'tokens/trainable': 1012438272, 'epoch': '2.473'}
 82%|███████████████████████████████████████████████████████████████████████▋               | 1444/1751 [24:07:05<5:04:50, 59.58s/it] 83%|███████████████████████████████████████████████████████████████████████▊               | 1445/1751 [24:08:04<5:04:07, 59.63s/it]                                                                                                                                     {'loss': '0.5237', 'grad_norm': '0.1807', 'learning_rate': '1.633e-06', 'ppl': '1.688', 'memory/max_active (GiB)': '74.5', 'memory/max_allocated (GiB)': '74.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '22.07', 'tokens/total': 2740125696, 'tokens/trainable': 1013163584, 'epoch': '2.475'}
 83%|███████████████████████████████████████████████████████████████████████▊               | 1445/1751 [24:08:04<5:04:07, 59.63s/it] 83%|███████████████████████████████████████████████████████████████████████▊               | 1446/1751 [24:09:06<5:06:13, 60.24s/it]                                                                                                                                     {'loss': '0.4926', 'grad_norm': '0.1699', 'learning_rate': '1.623e-06', 'ppl': '1.637', 'memory/max_active (GiB)': '75.65', 'memory/max_allocated (GiB)': '75.65', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.26', 'tokens/total': 2742064128, 'tokens/trainable': 1013903104, 'epoch': '2.476'}
 83%|███████████████████████████████████████████████████████████████████████▊               | 1446/1751 [24:09:06<5:06:13, 60.24s/it] 83%|███████████████████████████████████████████████████████████████████████▉               | 1447/1751 [24:10:04<5:02:26, 59.69s/it]                                                                                                                                     {'loss': '0.5507', 'grad_norm': '0.1895', 'learning_rate': '1.613e-06', 'ppl': '1.735', 'memory/max_active (GiB)': '74.09', 'memory/max_allocated (GiB)': '74.09', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '44.39', 'tokens/total': 2743922176, 'tokens/trainable': 1014606016, 'epoch': '2.478'}
 83%|███████████████████████████████████████████████████████████████████████▉               | 1447/1751 [24:10:04<5:02:26, 59.69s/it] 83%|███████████████████████████████████████████████████████████████████████▉               | 1448/1751 [24:11:08<5:06:43, 60.74s/it]                                                                                                                                     {'loss': '0.502', 'grad_norm': '0.1641', 'learning_rate': '1.602e-06', 'ppl': '1.652', 'memory/max_active (GiB)': '77.15', 'memory/max_allocated (GiB)': '77.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '186.7', 'tokens/total': 2746012672, 'tokens/trainable': 1015369792, 'epoch': '2.48'}
 83%|███████████████████████████████████████████████████████████████████████▉               | 1448/1751 [24:11:08<5:06:43, 60.74s/it] 83%|███████████████████████████████████████████████████████████████████████▉               | 1449/1751 [24:12:10<5:08:21, 61.26s/it]                                                                                                                                     {'loss': '0.5182', 'grad_norm': '0.167', 'learning_rate': '1.592e-06', 'ppl': '1.679', 'memory/max_active (GiB)': '76.6', 'memory/max_allocated (GiB)': '76.6', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '122.4', 'tokens/total': 2748038400, 'tokens/trainable': 1016094656, 'epoch': '2.482'}
 83%|███████████████████████████████████████████████████████████████████████▉               | 1449/1751 [24:12:10<5:08:21, 61.26s/it] 83%|████████████████████████████████████████████████████████████████████████               | 1450/1751 [24:13:12<5:09:08, 61.62s/it]                                                                                                                                     {'loss': '0.5402', 'grad_norm': '0.1719', 'learning_rate': '1.582e-06', 'ppl': '1.716', 'memory/max_active (GiB)': '75.87', 'memory/max_allocated (GiB)': '75.87', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '74.44', 'tokens/total': 2750084864, 'tokens/trainable': 1016846208, 'epoch': '2.483'}
 83%|████████████████████████████████████████████████████████████████████████               | 1450/1751 [24:13:12<5:09:08, 61.62s/it] 83%|████████████████████████████████████████████████████████████████████████               | 1451/1751 [24:14:15<5:09:00, 61.80s/it]                                                                                                                                     {'loss': '0.5092', 'grad_norm': '0.1699', 'learning_rate': '1.572e-06', 'ppl': '1.664', 'memory/max_active (GiB)': '76.25', 'memory/max_allocated (GiB)': '76.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '57.16', 'tokens/total': 2752064000, 'tokens/trainable': 1017581184, 'epoch': '2.485'}
 83%|████████████████████████████████████████████████████████████████████████               | 1451/1751 [24:14:15<5:09:00, 61.80s/it] 83%|████████████████████████████████████████████████████████████████████████▏              | 1452/1751 [24:15:14<5:04:23, 61.08s/it]                                                                                                                                     {'loss': '0.5204', 'grad_norm': '0.1816', 'learning_rate': '1.562e-06', 'ppl': '1.683', 'memory/max_active (GiB)': '72.67', 'memory/max_allocated (GiB)': '72.67', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '83.99', 'tokens/total': 2753928192, 'tokens/trainable': 1018269248, 'epoch': '2.487'}
 83%|████████████████████████████████████████████████████████████████████████▏              | 1452/1751 [24:15:14<5:04:23, 61.08s/it] 83%|████████████████████████████████████████████████████████████████████████▏              | 1453/1751 [24:16:14<5:01:26, 60.69s/it]                                                                                                                                     {'loss': '0.535', 'grad_norm': '0.1807', 'learning_rate': '1.551e-06', 'ppl': '1.707', 'memory/max_active (GiB)': '71.79', 'memory/max_allocated (GiB)': '71.79', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '16.71', 'tokens/total': 2755788800, 'tokens/trainable': 1018959616, 'epoch': '2.488'}
 83%|████████████████████████████████████████████████████████████████████████▏              | 1453/1751 [24:16:14<5:01:26, 60.69s/it] 83%|████████████████████████████████████████████████████████████████████████▏              | 1454/1751 [24:17:13<4:57:36, 60.12s/it]                                                                                                                                     {'loss': '0.5235', 'grad_norm': '0.1777', 'learning_rate': '1.541e-06', 'ppl': '1.688', 'memory/max_active (GiB)': '78.02', 'memory/max_allocated (GiB)': '78.02', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '16.11', 'tokens/total': 2757590784, 'tokens/trainable': 1019618944, 'epoch': '2.49'}
 83%|████████████████████████████████████████████████████████████████████████▏              | 1454/1751 [24:17:13<4:57:36, 60.12s/it] 83%|████████████████████████████████████████████████████████████████████████▎              | 1455/1751 [24:18:15<5:00:19, 60.88s/it]                                                                                                                                     {'loss': '0.4781', 'grad_norm': '0.1631', 'learning_rate': '1.531e-06', 'ppl': '1.613', 'memory/max_active (GiB)': '72.82', 'memory/max_allocated (GiB)': '72.82', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '76.86', 'tokens/total': 2759632128, 'tokens/trainable': 1020360576, 'epoch': '2.492'}
 83%|████████████████████████████████████████████████████████████████████████▎              | 1455/1751 [24:18:15<5:00:19, 60.88s/it] 83%|████████████████████████████████████████████████████████████████████████▎              | 1456/1751 [24:19:16<4:59:10, 60.85s/it]                                                                                                                                     {'loss': '0.5196', 'grad_norm': '0.1826', 'learning_rate': '1.521e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '72.85', 'memory/max_allocated (GiB)': '72.85', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.46', 'tokens/total': 2761503232, 'tokens/trainable': 1021041088, 'epoch': '2.494'}
 83%|████████████████████████████████████████████████████████████████████████▎              | 1456/1751 [24:19:16<4:59:10, 60.85s/it] 83%|████████████████████████████████████████████████████████████████████████▍              | 1457/1751 [24:20:19<5:01:55, 61.62s/it]                                                                                                                                     {'loss': '0.4898', 'grad_norm': '0.165', 'learning_rate': '1.511e-06', 'ppl': '1.632', 'memory/max_active (GiB)': '72.96', 'memory/max_allocated (GiB)': '72.96', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '63.32', 'tokens/total': 2763552512, 'tokens/trainable': 1021786752, 'epoch': '2.495'}
 83%|████████████████████████████████████████████████████████████████████████▍              | 1457/1751 [24:20:20<5:01:55, 61.62s/it] 83%|████████████████████████████████████████████████████████████████████████▍              | 1458/1751 [24:21:24<5:04:30, 62.36s/it]                                                                                                                                     {'loss': '0.4994', 'grad_norm': '0.1631', 'learning_rate': '1.501e-06', 'ppl': '1.648', 'memory/max_active (GiB)': '73.55', 'memory/max_allocated (GiB)': '73.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '122.3', 'tokens/total': 2765604608, 'tokens/trainable': 1022539200, 'epoch': '2.497'}
 83%|████████████████████████████████████████████████████████████████████████▍              | 1458/1751 [24:21:24<5:04:30, 62.36s/it] 83%|████████████████████████████████████████████████████████████████████████▍              | 1459/1751 [24:22:22<4:57:02, 61.04s/it]                                                                                                                                     {'loss': '0.5364', 'grad_norm': '0.1807', 'learning_rate': '1.491e-06', 'ppl': '1.71', 'memory/max_active (GiB)': '73.51', 'memory/max_allocated (GiB)': '73.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '43.69', 'tokens/total': 2767396864, 'tokens/trainable': 1023190528, 'epoch': '2.499'}
 83%|████████████████████████████████████████████████████████████████████████▍              | 1459/1751 [24:22:22<4:57:02, 61.04s/it] 83%|████████████████████████████████████████████████████████████████████████▌              | 1460/1751 [24:23:21<4:53:40, 60.55s/it]                                                                                                                                     {'loss': '0.5222', 'grad_norm': '0.1807', 'learning_rate': '1.481e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '72.28', 'memory/max_allocated (GiB)': '72.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111.4', 'tokens/total': 2769285120, 'tokens/trainable': 1023922304, 'epoch': '2.5'}
 83%|████████████████████████████████████████████████████████████████████████▌              | 1460/1751 [24:23:21<4:53:40, 60.55s/it] 83%|████████████████████████████████████████████████████████████████████████▌              | 1461/1751 [24:24:21<4:52:00, 60.41s/it]                                                                                                                                     {'loss': '0.5107', 'grad_norm': '0.1748', 'learning_rate': '1.472e-06', 'ppl': '1.666', 'memory/max_active (GiB)': '74.99', 'memory/max_allocated (GiB)': '74.99', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '54.3', 'tokens/total': 2771201792, 'tokens/trainable': 1024619840, 'epoch': '2.502'}
 83%|████████████████████████████████████████████████████████████████████████▌              | 1461/1751 [24:24:21<4:52:00, 60.41s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 1462/1751 [24:25:22<4:51:44, 60.57s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4994', 'grad_norm': '0.1748', 'learning_rate': '1.462e-06', 'ppl': '1.648', 'memory/max_active (GiB)': '76.27', 'memory/max_allocated (GiB)': '76.27', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '10.07', 'tokens/total': 2773145856, 'tokens/trainable': 1025312768, 'epoch': '2.504'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 1462/1751 [24:25:22<4:51:44, 60.57s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 1463/1751 [24:26:22<4:50:19, 60.49s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5507', 'grad_norm': '0.1904', 'learning_rate': '1.452e-06', 'ppl': '1.734', 'memory/max_active (GiB)': '76.38', 'memory/max_allocated (GiB)': '76.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '112.8', 'tokens/total': 2774992384, 'tokens/trainable': 1025997760, 'epoch': '2.506'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 1463/1751 [24:26:22<4:50:19, 60.49s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 1464/1751 [24:27:25<4:52:15, 61.10s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5223', 'grad_norm': '0.1807', 'learning_rate': '1.442e-06', 'ppl': '1.686', 'memory/max_active (GiB)': '74.73', 'memory/max_allocated (GiB)': '74.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '38.3', 'tokens/total': 2776896256, 'tokens/trainable': 1026728320, 'epoch': '2.507'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 1464/1751 [24:27:25<4:52:15, 61.10s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 1465/1751 [24:28:24<4:49:13, 60.68s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4955', 'grad_norm': '0.1816', 'learning_rate': '1.432e-06', 'ppl': '1.641', 'memory/max_active (GiB)': '72.62', 'memory/max_allocated (GiB)': '72.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '79.26', 'tokens/total': 2778699776, 'tokens/trainable': 1027422272, 'epoch': '2.509'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 1465/1751 [24:28:25<4:49:13, 60.68s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 1466/1751 [24:29:25<4:47:47, 60.59s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5194', 'grad_norm': '0.1709', 'learning_rate': '1.423e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '74.15', 'memory/max_allocated (GiB)': '74.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '48.28', 'tokens/total': 2780671744, 'tokens/trainable': 1028170816, 'epoch': '2.511'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 1466/1751 [24:29:25<4:47:47, 60.59s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 1467/1751 [24:30:24<4:44:33, 60.12s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5184', 'grad_norm': '0.1777', 'learning_rate': '1.413e-06', 'ppl': '1.679', 'memory/max_active (GiB)': '70.95', 'memory/max_allocated (GiB)': '70.95', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '79.06', 'tokens/total': 2782614784, 'tokens/trainable': 1028922624, 'epoch': '2.512'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 1467/1751 [24:30:24<4:44:33, 60.12s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                           | 1468/1751 [24:31:22<4:40:25, 59.46s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5347', 'grad_norm': '0.1963', 'learning_rate': '1.403e-06', 'ppl': '1.707', 'memory/max_active (GiB)': '72.46', 'memory/max_allocated (GiB)': '72.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '67.33', 'tokens/total': 2784450048, 'tokens/trainable': 1029558144, 'epoch': '2.514'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                           | 1468/1751 [24:31:22<4:40:25, 59.46s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 1469/1751 [24:32:20<4:38:14, 59.20s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5295', 'grad_norm': '0.1797', 'learning_rate': '1.394e-06', 'ppl': '1.698', 'memory/max_active (GiB)': '75.34', 'memory/max_allocated (GiB)': '75.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '115', 'tokens/total': 2786281216, 'tokens/trainable': 1030239744, 'epoch': '2.516'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 1469/1751 [24:32:20<4:38:14, 59.20s/it] 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 1470/1751 [24:33:20<4:38:10, 59.40s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5388', 'grad_norm': '0.1846', 'learning_rate': '1.384e-06', 'ppl': '1.714', 'memory/max_active (GiB)': '74.28', 'memory/max_allocated (GiB)': '74.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '113.8', 'tokens/total': 2788189696, 'tokens/trainable': 1030921216, 'epoch': '2.518'}
 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 1470/1751 [24:33:20<4:38:10, 59.40s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 1471/1751 [24:34:20<4:37:55, 59.55s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4983', 'grad_norm': '0.165', 'learning_rate': '1.375e-06', 'ppl': '1.646', 'memory/max_active (GiB)': '73.64', 'memory/max_allocated (GiB)': '73.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '221.7', 'tokens/total': 2790087936, 'tokens/trainable': 1031626688, 'epoch': '2.519'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 1471/1751 [24:34:20<4:37:55, 59.55s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 1472/1751 [24:35:19<4:35:41, 59.29s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5191', 'grad_norm': '0.1895', 'learning_rate': '1.365e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '76.43', 'memory/max_allocated (GiB)': '76.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '10.06', 'tokens/total': 2791878400, 'tokens/trainable': 1032301248, 'epoch': '2.521'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 1472/1751 [24:35:19<4:35:41, 59.29s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 1473/1751 [24:36:18<4:34:09, 59.17s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5207', 'grad_norm': '0.1777', 'learning_rate': '1.356e-06', 'ppl': '1.683', 'memory/max_active (GiB)': '71.88', 'memory/max_allocated (GiB)': '71.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '41.92', 'tokens/total': 2793712128, 'tokens/trainable': 1032991488, 'epoch': '2.523'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 1473/1751 [24:36:18<4:34:09, 59.17s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 1474/1751 [24:37:16<4:32:31, 59.03s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5653', 'grad_norm': '0.1943', 'learning_rate': '1.346e-06', 'ppl': '1.76', 'memory/max_active (GiB)': '74.45', 'memory/max_allocated (GiB)': '74.45', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.41', 'tokens/total': 2795550464, 'tokens/trainable': 1033663104, 'epoch': '2.524'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 1474/1751 [24:37:16<4:32:31, 59.03s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 1475/1751 [24:38:12<4:26:36, 57.96s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.6215', 'grad_norm': '0.1963', 'learning_rate': '1.337e-06', 'ppl': '1.862', 'memory/max_active (GiB)': '75.81', 'memory/max_allocated (GiB)': '75.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '65.27', 'tokens/total': 2797282560, 'tokens/trainable': 1034278720, 'epoch': '2.526'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 1475/1751 [24:38:12<4:26:36, 57.96s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 1476/1751 [24:39:13<4:30:00, 58.91s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5407', 'grad_norm': '0.1885', 'learning_rate': '1.327e-06', 'ppl': '1.717', 'memory/max_active (GiB)': '72.34', 'memory/max_allocated (GiB)': '72.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '151.2', 'tokens/total': 2799181568, 'tokens/trainable': 1034979072, 'epoch': '2.528'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 1476/1751 [24:39:13<4:30:00, 58.91s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 1477/1751 [24:40:14<4:31:44, 59.50s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4987', 'grad_norm': '0.1758', 'learning_rate': '1.318e-06', 'ppl': '1.647', 'memory/max_active (GiB)': '74.36', 'memory/max_allocated (GiB)': '74.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '160.8', 'tokens/total': 2801136128, 'tokens/trainable': 1035663232, 'epoch': '2.529'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 1477/1751 [24:40:14<4:31:44, 59.50s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 1478/1751 [24:41:16<4:34:46, 60.39s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5196', 'grad_norm': '0.1699', 'learning_rate': '1.308e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '74.55', 'memory/max_allocated (GiB)': '74.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '20.8', 'tokens/total': 2803156736, 'tokens/trainable': 1036409088, 'epoch': '2.531'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 1478/1751 [24:41:16<4:34:46, 60.39s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 1479/1751 [24:42:19<4:36:17, 60.95s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5031', 'grad_norm': '0.1748', 'learning_rate': '1.299e-06', 'ppl': '1.654', 'memory/max_active (GiB)': '76.27', 'memory/max_allocated (GiB)': '76.27', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '205.6', 'tokens/total': 2805149184, 'tokens/trainable': 1037175040, 'epoch': '2.533'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 1479/1751 [24:42:19<4:36:17, 60.95s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 1480/1751 [24:43:19<4:34:47, 60.84s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5015', 'grad_norm': '0.1758', 'learning_rate': '1.29e-06', 'ppl': '1.651', 'memory/max_active (GiB)': '76.39', 'memory/max_allocated (GiB)': '76.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '181.6', 'tokens/total': 2807082496, 'tokens/trainable': 1037894976, 'epoch': '2.535'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 1480/1751 [24:43:19<4:34:47, 60.84s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 1481/1751 [24:44:21<4:35:35, 61.24s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4921', 'grad_norm': '0.1719', 'learning_rate': '1.281e-06', 'ppl': '1.636', 'memory/max_active (GiB)': '74.8', 'memory/max_allocated (GiB)': '74.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111.3', 'tokens/total': 2809071872, 'tokens/trainable': 1038635008, 'epoch': '2.536'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 1481/1751 [24:44:21<4:35:35, 61.24s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 1482/1751 [24:45:22<4:33:56, 61.10s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5305', 'grad_norm': '0.1777', 'learning_rate': '1.271e-06', 'ppl': '1.7', 'memory/max_active (GiB)': '76.99', 'memory/max_allocated (GiB)': '76.99', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '160', 'tokens/total': 2811002624, 'tokens/trainable': 1039329600, 'epoch': '2.538'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 1482/1751 [24:45:22<4:33:56, 61.10s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 1483/1751 [24:46:23<4:32:36, 61.03s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5118', 'grad_norm': '0.1729', 'learning_rate': '1.262e-06', 'ppl': '1.668', 'memory/max_active (GiB)': '75.72', 'memory/max_allocated (GiB)': '75.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '133', 'tokens/total': 2812938496, 'tokens/trainable': 1040047232, 'epoch': '2.54'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 1483/1751 [24:46:23<4:32:36, 61.03s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 1484/1751 [24:47:25<4:32:56, 61.33s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5048', 'grad_norm': '0.166', 'learning_rate': '1.253e-06', 'ppl': '1.657', 'memory/max_active (GiB)': '75.16', 'memory/max_allocated (GiB)': '75.16', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '128.6', 'tokens/total': 2814919680, 'tokens/trainable': 1040796416, 'epoch': '2.541'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 1484/1751 [24:47:25<4:32:56, 61.33s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 1485/1751 [24:48:25<4:30:34, 61.03s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5038', 'grad_norm': '0.1846', 'learning_rate': '1.244e-06', 'ppl': '1.655', 'memory/max_active (GiB)': '71.29', 'memory/max_allocated (GiB)': '71.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.9', 'tokens/total': 2816795392, 'tokens/trainable': 1041491584, 'epoch': '2.543'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 1485/1751 [24:48:25<4:30:34, 61.03s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 1486/1751 [24:49:27<4:29:52, 61.10s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5064', 'grad_norm': '0.1855', 'learning_rate': '1.235e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '73.22', 'memory/max_allocated (GiB)': '73.22', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '161.4', 'tokens/total': 2818768384, 'tokens/trainable': 1042223872, 'epoch': '2.545'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 1486/1751 [24:49:27<4:29:52, 61.10s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 1487/1751 [24:50:26<4:25:51, 60.42s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5438', 'grad_norm': '0.1816', 'learning_rate': '1.226e-06', 'ppl': '1.722', 'memory/max_active (GiB)': '69.48', 'memory/max_allocated (GiB)': '69.48', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.66', 'tokens/total': 2820589312, 'tokens/trainable': 1042904000, 'epoch': '2.547'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 1487/1751 [24:50:26<4:25:51, 60.42s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 1488/1751 [24:51:29<4:29:12, 61.42s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4693', 'grad_norm': '0.1641', 'learning_rate': '1.217e-06', 'ppl': '1.599', 'memory/max_active (GiB)': '78.47', 'memory/max_allocated (GiB)': '78.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '149.5', 'tokens/total': 2822598400, 'tokens/trainable': 1043674112, 'epoch': '2.548'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 1488/1751 [24:51:29<4:29:12, 61.42s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 1489/1751 [24:52:28<4:25:15, 60.75s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.497', 'grad_norm': '0.1719', 'learning_rate': '1.208e-06', 'ppl': '1.644', 'memory/max_active (GiB)': '71.49', 'memory/max_allocated (GiB)': '71.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '207.4', 'tokens/total': 2824462336, 'tokens/trainable': 1044359872, 'epoch': '2.55'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 1489/1751 [24:52:28<4:25:15, 60.75s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 1490/1751 [24:53:30<4:25:30, 61.04s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4821', 'grad_norm': '0.1699', 'learning_rate': '1.199e-06', 'ppl': '1.62', 'memory/max_active (GiB)': '75.25', 'memory/max_allocated (GiB)': '75.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '87.61', 'tokens/total': 2826400000, 'tokens/trainable': 1045095232, 'epoch': '2.552'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 1490/1751 [24:53:30<4:25:30, 61.04s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 1491/1751 [24:54:30<4:22:35, 60.60s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5068', 'grad_norm': '0.1738', 'learning_rate': '1.19e-06', 'ppl': '1.66', 'memory/max_active (GiB)': '76.71', 'memory/max_allocated (GiB)': '76.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.5', 'tokens/total': 2828286976, 'tokens/trainable': 1045796608, 'epoch': '2.553'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 1491/1751 [24:54:30<4:22:35, 60.60s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 1492/1751 [24:55:32<4:23:06, 60.95s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5247', 'grad_norm': '0.1748', 'learning_rate': '1.181e-06', 'ppl': '1.69', 'memory/max_active (GiB)': '76.77', 'memory/max_allocated (GiB)': '76.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '39.49', 'tokens/total': 2830222336, 'tokens/trainable': 1046507584, 'epoch': '2.555'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 1492/1751 [24:55:32<4:23:06, 60.95s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 1493/1751 [24:56:34<4:24:23, 61.49s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5194', 'grad_norm': '0.1729', 'learning_rate': '1.172e-06', 'ppl': '1.681', 'memory/max_active (GiB)': '72.42', 'memory/max_allocated (GiB)': '72.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '36.46', 'tokens/total': 2832228608, 'tokens/trainable': 1047235008, 'epoch': '2.557'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 1493/1751 [24:56:34<4:24:23, 61.49s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 1494/1751 [24:57:33<4:19:53, 60.67s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5328', 'grad_norm': '0.1807', 'learning_rate': '1.163e-06', 'ppl': '1.704', 'memory/max_active (GiB)': '71.67', 'memory/max_allocated (GiB)': '71.67', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '12.87', 'tokens/total': 2834113280, 'tokens/trainable': 1047915968, 'epoch': '2.559'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 1494/1751 [24:57:33<4:19:53, 60.67s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 1495/1751 [24:58:32<4:16:30, 60.12s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4999', 'grad_norm': '0.1787', 'learning_rate': '1.154e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '72.64', 'memory/max_allocated (GiB)': '72.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '109.7', 'tokens/total': 2835995904, 'tokens/trainable': 1048602880, 'epoch': '2.56'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 1495/1751 [24:58:32<4:16:30, 60.12s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 1496/1751 [24:59:33<4:16:38, 60.39s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5208', 'grad_norm': '0.1738', 'learning_rate': '1.145e-06', 'ppl': '1.683', 'memory/max_active (GiB)': '74.19', 'memory/max_allocated (GiB)': '74.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '36.28', 'tokens/total': 2837963008, 'tokens/trainable': 1049344960, 'epoch': '2.562'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 1496/1751 [24:59:33<4:16:38, 60.39s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                       | 1497/1751 [25:00:31<4:13:17, 59.83s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.516', 'grad_norm': '0.1748', 'learning_rate': '1.137e-06', 'ppl': '1.675', 'memory/max_active (GiB)': '70.99', 'memory/max_allocated (GiB)': '70.99', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.72', 'tokens/total': 2839871232, 'tokens/trainable': 1050032832, 'epoch': '2.564'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                       | 1497/1751 [25:00:31<4:13:17, 59.83s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 1498/1751 [25:01:31<4:11:28, 59.64s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5561', 'grad_norm': '0.1895', 'learning_rate': '1.128e-06', 'ppl': '1.744', 'memory/max_active (GiB)': '75.68', 'memory/max_allocated (GiB)': '75.68', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '119.6', 'tokens/total': 2841725952, 'tokens/trainable': 1050743040, 'epoch': '2.565'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 1498/1751 [25:01:31<4:11:28, 59.64s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 1499/1751 [25:02:31<4:12:01, 60.01s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4863', 'grad_norm': '0.1816', 'learning_rate': '1.119e-06', 'ppl': '1.626', 'memory/max_active (GiB)': '70.76', 'memory/max_allocated (GiB)': '70.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '101.5', 'tokens/total': 2843675136, 'tokens/trainable': 1051464512, 'epoch': '2.567'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 1499/1751 [25:02:31<4:12:01, 60.01s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 1500/1751 [25:03:35<4:15:02, 60.96s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4754', 'grad_norm': '0.165', 'learning_rate': '1.111e-06', 'ppl': '1.609', 'memory/max_active (GiB)': '75.58', 'memory/max_allocated (GiB)': '75.58', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '122.5', 'tokens/total': 2845713664, 'tokens/trainable': 1052196608, 'epoch': '2.569'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 1500/1751 [25:03:35<4:15:02, 60.96s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 1501/1751 [25:04:35<4:12:38, 60.63s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5064', 'grad_norm': '0.1777', 'learning_rate': '1.102e-06', 'ppl': '1.659', 'memory/max_active (GiB)': '74.55', 'memory/max_allocated (GiB)': '74.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '43.6', 'tokens/total': 2847586560, 'tokens/trainable': 1052895552, 'epoch': '2.571'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 1501/1751 [25:04:35<4:12:38, 60.63s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 1502/1751 [25:05:34<4:09:47, 60.19s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5219', 'grad_norm': '0.1914', 'learning_rate': '1.093e-06', 'ppl': '1.685', 'memory/max_active (GiB)': '76.33', 'memory/max_allocated (GiB)': '76.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '56.67', 'tokens/total': 2849424896, 'tokens/trainable': 1053568704, 'epoch': '2.572'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 1502/1751 [25:05:34<4:09:47, 60.19s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 1503/1751 [25:06:38<4:14:30, 61.57s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4994', 'grad_norm': '0.1846', 'learning_rate': '1.085e-06', 'ppl': '1.648', 'memory/max_active (GiB)': '74.04', 'memory/max_allocated (GiB)': '74.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '148.6', 'tokens/total': 2851510016, 'tokens/trainable': 1054318528, 'epoch': '2.574'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 1503/1751 [25:06:38<4:14:30, 61.57s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 1504/1751 [25:07:37<4:10:06, 60.75s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.51', 'grad_norm': '0.1807', 'learning_rate': '1.076e-06', 'ppl': '1.665', 'memory/max_active (GiB)': '70.23', 'memory/max_allocated (GiB)': '70.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.07', 'tokens/total': 2853323520, 'tokens/trainable': 1054991808, 'epoch': '2.576'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 1504/1751 [25:07:37<4:10:06, 60.75s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 1505/1751 [25:08:40<4:11:27, 61.33s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4929', 'grad_norm': '0.1572', 'learning_rate': '1.068e-06', 'ppl': '1.637', 'memory/max_active (GiB)': '76.66', 'memory/max_allocated (GiB)': '76.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '163.5', 'tokens/total': 2855311616, 'tokens/trainable': 1055742976, 'epoch': '2.577'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 1505/1751 [25:08:40<4:11:27, 61.33s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 1506/1751 [25:09:42<4:11:02, 61.48s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5', 'grad_norm': '0.1719', 'learning_rate': '1.059e-06', 'ppl': '1.649', 'memory/max_active (GiB)': '74.25', 'memory/max_allocated (GiB)': '74.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '23.03', 'tokens/total': 2857250304, 'tokens/trainable': 1056464576, 'epoch': '2.579'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 1506/1751 [25:09:42<4:11:02, 61.48s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 1507/1751 [25:10:42<4:08:37, 61.14s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5168', 'grad_norm': '0.1787', 'learning_rate': '1.051e-06', 'ppl': '1.677', 'memory/max_active (GiB)': '75.32', 'memory/max_allocated (GiB)': '75.32', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.07', 'tokens/total': 2859132672, 'tokens/trainable': 1057136256, 'epoch': '2.581'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 1507/1751 [25:10:42<4:08:37, 61.14s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 1508/1751 [25:11:42<4:06:02, 60.75s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4977', 'grad_norm': '0.1797', 'learning_rate': '1.042e-06', 'ppl': '1.645', 'memory/max_active (GiB)': '71.26', 'memory/max_allocated (GiB)': '71.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '137.8', 'tokens/total': 2861028608, 'tokens/trainable': 1057828672, 'epoch': '2.583'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 1508/1751 [25:11:42<4:06:02, 60.75s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 1509/1751 [25:12:42<4:04:14, 60.56s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5406', 'grad_norm': '0.1758', 'learning_rate': '1.034e-06', 'ppl': '1.717', 'memory/max_active (GiB)': '69.37', 'memory/max_allocated (GiB)': '69.37', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '56.35', 'tokens/total': 2862920960, 'tokens/trainable': 1058526400, 'epoch': '2.584'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 1509/1751 [25:12:42<4:04:14, 60.56s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 1510/1751 [25:13:42<4:02:06, 60.28s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5402', 'grad_norm': '0.1885', 'learning_rate': '1.026e-06', 'ppl': '1.716', 'memory/max_active (GiB)': '74.12', 'memory/max_allocated (GiB)': '74.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.35', 'tokens/total': 2864780544, 'tokens/trainable': 1059192000, 'epoch': '2.586'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                     | 1510/1751 [25:13:42<4:02:06, 60.28s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 1511/1751 [25:14:43<4:02:42, 60.68s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5095', 'grad_norm': '0.168', 'learning_rate': '1.017e-06', 'ppl': '1.665', 'memory/max_active (GiB)': '74.13', 'memory/max_allocated (GiB)': '74.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '174.9', 'tokens/total': 2866726912, 'tokens/trainable': 1059944768, 'epoch': '2.588'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 1511/1751 [25:14:43<4:02:42, 60.68s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 1512/1751 [25:15:41<3:57:58, 59.74s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5363', 'grad_norm': '0.1895', 'learning_rate': '1.009e-06', 'ppl': '1.71', 'memory/max_active (GiB)': '75.78', 'memory/max_allocated (GiB)': '75.78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '76.07', 'tokens/total': 2868529408, 'tokens/trainable': 1060598336, 'epoch': '2.589'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 1512/1751 [25:15:41<3:57:58, 59.74s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 1513/1751 [25:16:40<3:56:32, 59.63s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5083', 'grad_norm': '0.1826', 'learning_rate': '1.001e-06', 'ppl': '1.663', 'memory/max_active (GiB)': '73.74', 'memory/max_allocated (GiB)': '73.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '121.2', 'tokens/total': 2870422272, 'tokens/trainable': 1061286080, 'epoch': '2.591'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 1513/1751 [25:16:40<3:56:32, 59.63s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 1514/1751 [25:17:43<3:59:34, 60.65s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.507', 'grad_norm': '0.1729', 'learning_rate': '9.927e-07', 'ppl': '1.66', 'memory/max_active (GiB)': '72.74', 'memory/max_allocated (GiB)': '72.74', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.67', 'tokens/total': 2872412416, 'tokens/trainable': 1062025664, 'epoch': '2.593'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 1514/1751 [25:17:43<3:59:34, 60.65s/it] 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 1515/1751 [25:18:43<3:57:54, 60.48s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5391', 'grad_norm': '0.1846', 'learning_rate': '9.845e-07', 'ppl': '1.715', 'memory/max_active (GiB)': '73.31', 'memory/max_allocated (GiB)': '73.31', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '32.26', 'tokens/total': 2874292736, 'tokens/trainable': 1062694080, 'epoch': '2.595'}
 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 1515/1751 [25:18:43<3:57:54, 60.48s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 1516/1751 [25:19:42<3:55:04, 60.02s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5159', 'grad_norm': '0.1826', 'learning_rate': '9.763e-07', 'ppl': '1.675', 'memory/max_active (GiB)': '77.25', 'memory/max_allocated (GiB)': '77.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '54.98', 'tokens/total': 2876127744, 'tokens/trainable': 1063374848, 'epoch': '2.596'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 1516/1751 [25:19:42<3:55:04, 60.02s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 1517/1751 [25:20:44<3:56:15, 60.58s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5238', 'grad_norm': '0.1689', 'learning_rate': '9.682e-07', 'ppl': '1.688', 'memory/max_active (GiB)': '75.76', 'memory/max_allocated (GiB)': '75.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '43.75', 'tokens/total': 2878145280, 'tokens/trainable': 1064101824, 'epoch': '2.598'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 1517/1751 [25:20:44<3:56:15, 60.58s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 1518/1751 [25:21:44<3:54:08, 60.29s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5086', 'grad_norm': '0.2656', 'learning_rate': '9.601e-07', 'ppl': '1.663', 'memory/max_active (GiB)': '77.15', 'memory/max_allocated (GiB)': '77.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '147.2', 'tokens/total': 2880040448, 'tokens/trainable': 1064773376, 'epoch': '2.6'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                   | 1518/1751 [25:21:44<3:54:08, 60.29s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 1519/1751 [25:22:43<3:52:02, 60.01s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4979', 'grad_norm': '0.1738', 'learning_rate': '9.521e-07', 'ppl': '1.645', 'memory/max_active (GiB)': '66.41', 'memory/max_allocated (GiB)': '66.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.1', 'tokens/total': 2881923328, 'tokens/trainable': 1065479232, 'epoch': '2.601'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 1519/1751 [25:22:43<3:52:02, 60.01s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                   | 1520/1751 [25:23:43<3:51:01, 60.01s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5171', 'grad_norm': '0.1787', 'learning_rate': '9.44e-07', 'ppl': '1.677', 'memory/max_active (GiB)': '77.05', 'memory/max_allocated (GiB)': '77.05', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.22', 'tokens/total': 2883812864, 'tokens/trainable': 1066170752, 'epoch': '2.603'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                   | 1520/1751 [25:23:43<3:51:01, 60.01s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 1521/1751 [25:24:41<3:47:50, 59.44s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5178', 'grad_norm': '0.1836', 'learning_rate': '9.36e-07', 'ppl': '1.678', 'memory/max_active (GiB)': '74.41', 'memory/max_allocated (GiB)': '74.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '181.4', 'tokens/total': 2885625088, 'tokens/trainable': 1066844032, 'epoch': '2.605'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 1521/1751 [25:24:41<3:47:50, 59.44s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 1522/1751 [25:25:42<3:47:44, 59.67s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5209', 'grad_norm': '0.1777', 'learning_rate': '9.281e-07', 'ppl': '1.684', 'memory/max_active (GiB)': '70.17', 'memory/max_allocated (GiB)': '70.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '220.9', 'tokens/total': 2887529472, 'tokens/trainable': 1067549824, 'epoch': '2.607'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 1522/1751 [25:25:42<3:47:44, 59.67s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 1523/1751 [25:26:42<3:47:16, 59.81s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5118', 'grad_norm': '0.1787', 'learning_rate': '9.201e-07', 'ppl': '1.668', 'memory/max_active (GiB)': '75.97', 'memory/max_allocated (GiB)': '75.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '44.13', 'tokens/total': 2889396480, 'tokens/trainable': 1068253568, 'epoch': '2.608'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 1523/1751 [25:26:42<3:47:16, 59.81s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 1524/1751 [25:27:42<3:47:08, 60.04s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4919', 'grad_norm': '0.1729', 'learning_rate': '9.123e-07', 'ppl': '1.635', 'memory/max_active (GiB)': '71.2', 'memory/max_allocated (GiB)': '71.2', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.96', 'tokens/total': 2891340288, 'tokens/trainable': 1068975488, 'epoch': '2.61'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 1524/1751 [25:27:42<3:47:08, 60.04s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 1525/1751 [25:28:42<3:45:19, 59.82s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5215', 'grad_norm': '0.1777', 'learning_rate': '9.044e-07', 'ppl': '1.684', 'memory/max_active (GiB)': '70.84', 'memory/max_allocated (GiB)': '70.84', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '61.99', 'tokens/total': 2893237760, 'tokens/trainable': 1069678784, 'epoch': '2.612'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 1525/1751 [25:28:42<3:45:19, 59.82s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 1526/1751 [25:29:41<3:44:03, 59.75s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5291', 'grad_norm': '0.1875', 'learning_rate': '8.966e-07', 'ppl': '1.697', 'memory/max_active (GiB)': '75.29', 'memory/max_allocated (GiB)': '75.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '85.05', 'tokens/total': 2895122688, 'tokens/trainable': 1070358912, 'epoch': '2.613'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 1526/1751 [25:29:41<3:44:03, 59.75s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 1527/1751 [25:30:42<3:44:27, 60.12s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4654', 'grad_norm': '0.165', 'learning_rate': '8.888e-07', 'ppl': '1.593', 'memory/max_active (GiB)': '74.25', 'memory/max_allocated (GiB)': '74.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '36.67', 'tokens/total': 2897086720, 'tokens/trainable': 1071096128, 'epoch': '2.615'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 1527/1751 [25:30:42<3:44:27, 60.12s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 1528/1751 [25:31:43<3:44:46, 60.48s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5059', 'grad_norm': '0.168', 'learning_rate': '8.81e-07', 'ppl': '1.658', 'memory/max_active (GiB)': '73.42', 'memory/max_allocated (GiB)': '73.42', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.58', 'tokens/total': 2899038976, 'tokens/trainable': 1071809536, 'epoch': '2.617'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 1528/1751 [25:31:43<3:44:46, 60.48s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 1529/1751 [25:32:44<3:44:14, 60.61s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5059', 'grad_norm': '0.1826', 'learning_rate': '8.733e-07', 'ppl': '1.658', 'memory/max_active (GiB)': '70.79', 'memory/max_allocated (GiB)': '70.79', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.84', 'tokens/total': 2900979968, 'tokens/trainable': 1072524544, 'epoch': '2.619'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 1529/1751 [25:32:44<3:44:14, 60.61s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 1530/1751 [25:33:45<3:42:58, 60.54s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5042', 'grad_norm': '0.1719', 'learning_rate': '8.656e-07', 'ppl': '1.656', 'memory/max_active (GiB)': '75.17', 'memory/max_allocated (GiB)': '75.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '61.34', 'tokens/total': 2902889984, 'tokens/trainable': 1073218304, 'epoch': '2.62'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 1530/1751 [25:33:45<3:42:58, 60.54s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 1531/1751 [25:34:47<3:43:22, 60.92s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5132', 'grad_norm': '0.1719', 'learning_rate': '8.579e-07', 'ppl': '1.671', 'memory/max_active (GiB)': '72.04', 'memory/max_allocated (GiB)': '72.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '39.52', 'tokens/total': 2904852992, 'tokens/trainable': 1073949696, 'epoch': '2.622'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 1531/1751 [25:34:47<3:43:22, 60.92s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 1532/1751 [25:35:48<3:42:49, 61.05s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4746', 'grad_norm': '0.1709', 'learning_rate': '8.503e-07', 'ppl': '1.607', 'memory/max_active (GiB)': '70.61', 'memory/max_allocated (GiB)': '70.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.88', 'tokens/total': 2906801408, 'tokens/trainable': 1074651136, 'epoch': '2.624'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 1532/1751 [25:35:48<3:42:49, 61.05s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 1533/1751 [25:36:44<3:36:52, 59.69s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5601', 'grad_norm': '0.1875', 'learning_rate': '8.427e-07', 'ppl': '1.751', 'memory/max_active (GiB)': '74.47', 'memory/max_allocated (GiB)': '74.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '43.98', 'tokens/total': 2908527872, 'tokens/trainable': 1075311744, 'epoch': '2.625'}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 1533/1751 [25:36:44<3:36:52, 59.69s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 1534/1751 [25:37:43<3:35:14, 59.51s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5321', 'grad_norm': '0.1768', 'learning_rate': '8.351e-07', 'ppl': '1.702', 'memory/max_active (GiB)': '75.29', 'memory/max_allocated (GiB)': '75.29', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.6', 'tokens/total': 2910355968, 'tokens/trainable': 1076005248, 'epoch': '2.627'}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 1534/1751 [25:37:43<3:35:14, 59.51s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 1535/1751 [25:38:41<3:32:16, 58.96s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5067', 'grad_norm': '0.1758', 'learning_rate': '8.276e-07', 'ppl': '1.66', 'memory/max_active (GiB)': '76.59', 'memory/max_allocated (GiB)': '76.59', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '35.01', 'tokens/total': 2912178432, 'tokens/trainable': 1076686336, 'epoch': '2.629'}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 1535/1751 [25:38:41<3:32:16, 58.96s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 1536/1751 [25:39:39<3:30:15, 58.68s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5279', 'grad_norm': '0.1777', 'learning_rate': '8.201e-07', 'ppl': '1.695', 'memory/max_active (GiB)': '73.4', 'memory/max_allocated (GiB)': '73.4', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '13.99', 'tokens/total': 2913972480, 'tokens/trainable': 1077375872, 'epoch': '2.631'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 1536/1751 [25:39:39<3:30:15, 58.68s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 1537/1751 [25:40:41<3:32:40, 59.63s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4971', 'grad_norm': '0.1641', 'learning_rate': '8.126e-07', 'ppl': '1.644', 'memory/max_active (GiB)': '73.54', 'memory/max_allocated (GiB)': '73.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '128.3', 'tokens/total': 2915929088, 'tokens/trainable': 1078089856, 'epoch': '2.632'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 1537/1751 [25:40:41<3:32:40, 59.63s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 1538/1751 [25:41:45<3:36:01, 60.85s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4816', 'grad_norm': '0.1621', 'learning_rate': '8.051e-07', 'ppl': '1.619', 'memory/max_active (GiB)': '78.15', 'memory/max_allocated (GiB)': '78.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '113.5', 'tokens/total': 2918010112, 'tokens/trainable': 1078841344, 'epoch': '2.634'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 1538/1751 [25:41:45<3:36:01, 60.85s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 1539/1751 [25:42:46<3:34:58, 60.84s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5592', 'grad_norm': '0.1846', 'learning_rate': '7.977e-07', 'ppl': '1.749', 'memory/max_active (GiB)': '74.62', 'memory/max_allocated (GiB)': '74.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.81', 'tokens/total': 2919869952, 'tokens/trainable': 1079539456, 'epoch': '2.636'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 1539/1751 [25:42:46<3:34:58, 60.84s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 1540/1751 [25:43:45<3:32:14, 60.35s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5469', 'grad_norm': '0.1846', 'learning_rate': '7.904e-07', 'ppl': '1.728', 'memory/max_active (GiB)': '77.45', 'memory/max_allocated (GiB)': '77.45', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '66.68', 'tokens/total': 2921741568, 'tokens/trainable': 1080248320, 'epoch': '2.637'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 1540/1751 [25:43:45<3:32:14, 60.35s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 1541/1751 [25:44:44<3:30:06, 60.03s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4865', 'grad_norm': '0.1709', 'learning_rate': '7.83e-07', 'ppl': '1.627', 'memory/max_active (GiB)': '72.18', 'memory/max_allocated (GiB)': '72.18', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.03', 'tokens/total': 2923615744, 'tokens/trainable': 1080937216, 'epoch': '2.639'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 1541/1751 [25:44:44<3:30:06, 60.03s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 1542/1751 [25:45:44<3:28:37, 59.89s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5133', 'grad_norm': '0.1787', 'learning_rate': '7.757e-07', 'ppl': '1.671', 'memory/max_active (GiB)': '75.9', 'memory/max_allocated (GiB)': '75.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '119.2', 'tokens/total': 2925469696, 'tokens/trainable': 1081610368, 'epoch': '2.641'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 1542/1751 [25:45:44<3:28:37, 59.89s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 1543/1751 [25:46:43<3:27:24, 59.83s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5564', 'grad_norm': '0.1816', 'learning_rate': '7.684e-07', 'ppl': '1.744', 'memory/max_active (GiB)': '73.73', 'memory/max_allocated (GiB)': '73.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.54', 'tokens/total': 2927356928, 'tokens/trainable': 1082311296, 'epoch': '2.643'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 1543/1751 [25:46:43<3:27:24, 59.83s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 1544/1751 [25:47:43<3:25:51, 59.67s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5202', 'grad_norm': '0.1768', 'learning_rate': '7.612e-07', 'ppl': '1.682', 'memory/max_active (GiB)': '76.46', 'memory/max_allocated (GiB)': '76.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '139.6', 'tokens/total': 2929241856, 'tokens/trainable': 1083009536, 'epoch': '2.644'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 1544/1751 [25:47:43<3:25:51, 59.67s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 1545/1751 [25:48:45<3:27:27, 60.43s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5324', 'grad_norm': '0.1758', 'learning_rate': '7.54e-07', 'ppl': '1.703', 'memory/max_active (GiB)': '76.33', 'memory/max_allocated (GiB)': '76.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '181', 'tokens/total': 2931223808, 'tokens/trainable': 1083751040, 'epoch': '2.646'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 1545/1751 [25:48:45<3:27:27, 60.43s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                               | 1546/1751 [25:49:45<3:25:53, 60.26s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5234', 'grad_norm': '0.1797', 'learning_rate': '7.468e-07', 'ppl': '1.688', 'memory/max_active (GiB)': '74.92', 'memory/max_allocated (GiB)': '74.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '34.6', 'tokens/total': 2933132800, 'tokens/trainable': 1084453248, 'epoch': '2.648'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                               | 1546/1751 [25:49:45<3:25:53, 60.26s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 1547/1751 [25:50:41<3:21:24, 59.24s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5384', 'grad_norm': '0.1836', 'learning_rate': '7.397e-07', 'ppl': '1.713', 'memory/max_active (GiB)': '71.04', 'memory/max_allocated (GiB)': '71.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '171.3', 'tokens/total': 2934922240, 'tokens/trainable': 1085124352, 'epoch': '2.649'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 1547/1751 [25:50:41<3:21:24, 59.24s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 1548/1751 [25:51:40<3:19:15, 58.89s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5175', 'grad_norm': '0.1826', 'learning_rate': '7.326e-07', 'ppl': '1.678', 'memory/max_active (GiB)': '74.9', 'memory/max_allocated (GiB)': '74.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '91.98', 'tokens/total': 2936792576, 'tokens/trainable': 1085815552, 'epoch': '2.651'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 1548/1751 [25:51:40<3:19:15, 58.89s/it] 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 1549/1751 [25:52:40<3:19:23, 59.23s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5474', 'grad_norm': '0.1826', 'learning_rate': '7.255e-07', 'ppl': '1.729', 'memory/max_active (GiB)': '75.39', 'memory/max_allocated (GiB)': '75.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '57', 'tokens/total': 2938685952, 'tokens/trainable': 1086518528, 'epoch': '2.653'}
 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 1549/1751 [25:52:40<3:19:23, 59.23s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 1550/1751 [25:53:42<3:21:13, 60.06s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5188', 'grad_norm': '0.1729', 'learning_rate': '7.184e-07', 'ppl': '1.68', 'memory/max_active (GiB)': '73.92', 'memory/max_allocated (GiB)': '73.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '206.2', 'tokens/total': 2940668672, 'tokens/trainable': 1087261440, 'epoch': '2.655'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 1550/1751 [25:53:42<3:21:13, 60.06s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 1551/1751 [25:54:42<3:20:49, 60.25s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5043', 'grad_norm': '0.1719', 'learning_rate': '7.114e-07', 'ppl': '1.656', 'memory/max_active (GiB)': '76.34', 'memory/max_allocated (GiB)': '76.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.05', 'tokens/total': 2942561792, 'tokens/trainable': 1087971968, 'epoch': '2.656'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 1551/1751 [25:54:42<3:20:49, 60.25s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 1552/1751 [25:55:42<3:19:18, 60.09s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5154', 'grad_norm': '0.1738', 'learning_rate': '7.045e-07', 'ppl': '1.674', 'memory/max_active (GiB)': '77.32', 'memory/max_allocated (GiB)': '77.32', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '96.3', 'tokens/total': 2944447232, 'tokens/trainable': 1088682880, 'epoch': '2.658'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 1552/1751 [25:55:42<3:19:18, 60.09s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 1553/1751 [25:56:41<3:17:31, 59.86s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5146', 'grad_norm': '0.1699', 'learning_rate': '6.975e-07', 'ppl': '1.673', 'memory/max_active (GiB)': '69.88', 'memory/max_allocated (GiB)': '69.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '32.69', 'tokens/total': 2946321408, 'tokens/trainable': 1089382144, 'epoch': '2.66'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 1553/1751 [25:56:41<3:17:31, 59.86s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 1554/1751 [25:57:43<3:18:48, 60.55s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5439', 'grad_norm': '0.1816', 'learning_rate': '6.906e-07', 'ppl': '1.723', 'memory/max_active (GiB)': '76.38', 'memory/max_allocated (GiB)': '76.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '38.59', 'tokens/total': 2948330240, 'tokens/trainable': 1090095488, 'epoch': '2.661'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 1554/1751 [25:57:43<3:18:48, 60.55s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 1555/1751 [25:58:45<3:18:33, 60.78s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5173', 'grad_norm': '0.1797', 'learning_rate': '6.837e-07', 'ppl': '1.678', 'memory/max_active (GiB)': '73.54', 'memory/max_allocated (GiB)': '73.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '114', 'tokens/total': 2950238976, 'tokens/trainable': 1090795008, 'epoch': '2.663'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 1555/1751 [25:58:45<3:18:33, 60.78s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 1556/1751 [25:59:44<3:16:00, 60.31s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5486', 'grad_norm': '0.1865', 'learning_rate': '6.769e-07', 'ppl': '1.731', 'memory/max_active (GiB)': '70.47', 'memory/max_allocated (GiB)': '70.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '65.5', 'tokens/total': 2952075264, 'tokens/trainable': 1091477504, 'epoch': '2.665'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 1556/1751 [25:59:44<3:16:00, 60.31s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 1557/1751 [26:00:40<3:10:54, 59.04s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5431', 'grad_norm': '0.1895', 'learning_rate': '6.701e-07', 'ppl': '1.721', 'memory/max_active (GiB)': '77.15', 'memory/max_allocated (GiB)': '77.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.24', 'tokens/total': 2953801728, 'tokens/trainable': 1092108672, 'epoch': '2.667'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 1557/1751 [26:00:40<3:10:54, 59.04s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 1558/1751 [26:01:40<3:10:48, 59.32s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5024', 'grad_norm': '0.1836', 'learning_rate': '6.633e-07', 'ppl': '1.653', 'memory/max_active (GiB)': '75.38', 'memory/max_allocated (GiB)': '75.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '105.6', 'tokens/total': 2955667712, 'tokens/trainable': 1092793984, 'epoch': '2.668'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 1558/1751 [26:01:40<3:10:48, 59.32s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 1559/1751 [26:02:41<3:11:01, 59.70s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5305', 'grad_norm': '0.1816', 'learning_rate': '6.565e-07', 'ppl': '1.7', 'memory/max_active (GiB)': '76.04', 'memory/max_allocated (GiB)': '76.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.8', 'tokens/total': 2957520896, 'tokens/trainable': 1093495168, 'epoch': '2.67'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 1559/1751 [26:02:41<3:11:01, 59.70s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 1560/1751 [26:03:43<3:12:12, 60.38s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5123', 'grad_norm': '0.1709', 'learning_rate': '6.498e-07', 'ppl': '1.669', 'memory/max_active (GiB)': '77.76', 'memory/max_allocated (GiB)': '77.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '153.1', 'tokens/total': 2959544320, 'tokens/trainable': 1094231552, 'epoch': '2.672'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 1560/1751 [26:03:43<3:12:12, 60.38s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 1561/1751 [26:04:42<3:09:54, 59.97s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5514', 'grad_norm': '0.1963', 'learning_rate': '6.432e-07', 'ppl': '1.736', 'memory/max_active (GiB)': '77.14', 'memory/max_allocated (GiB)': '77.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '83.18', 'tokens/total': 2961367808, 'tokens/trainable': 1094877696, 'epoch': '2.673'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 1561/1751 [26:04:42<3:09:54, 59.97s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 1562/1751 [26:05:42<3:09:38, 60.20s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5218', 'grad_norm': '0.1699', 'learning_rate': '6.365e-07', 'ppl': '1.685', 'memory/max_active (GiB)': '76.41', 'memory/max_allocated (GiB)': '76.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.51', 'tokens/total': 2963303936, 'tokens/trainable': 1095607040, 'epoch': '2.675'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 1562/1751 [26:05:42<3:09:38, 60.20s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 1563/1751 [26:06:43<3:09:09, 60.37s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5426', 'grad_norm': '0.1855', 'learning_rate': '6.299e-07', 'ppl': '1.72', 'memory/max_active (GiB)': '69.56', 'memory/max_allocated (GiB)': '69.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '30.18', 'tokens/total': 2965229824, 'tokens/trainable': 1096310528, 'epoch': '2.677'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 1563/1751 [26:06:43<3:09:09, 60.37s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 1564/1751 [26:07:46<3:10:43, 61.20s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4972', 'grad_norm': '0.1689', 'learning_rate': '6.233e-07', 'ppl': '1.644', 'memory/max_active (GiB)': '75.16', 'memory/max_allocated (GiB)': '75.16', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '94.87', 'tokens/total': 2967249664, 'tokens/trainable': 1097054208, 'epoch': '2.679'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 1564/1751 [26:07:46<3:10:43, 61.20s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 1565/1751 [26:08:47<3:09:39, 61.18s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4801', 'grad_norm': '0.165', 'learning_rate': '6.168e-07', 'ppl': '1.616', 'memory/max_active (GiB)': '75.76', 'memory/max_allocated (GiB)': '75.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '177.2', 'tokens/total': 2969185280, 'tokens/trainable': 1097744000, 'epoch': '2.68'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 1565/1751 [26:08:47<3:09:39, 61.18s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 1566/1751 [26:09:47<3:07:26, 60.79s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5361', 'grad_norm': '0.1777', 'learning_rate': '6.103e-07', 'ppl': '1.709', 'memory/max_active (GiB)': '73.41', 'memory/max_allocated (GiB)': '73.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '30.75', 'tokens/total': 2971063296, 'tokens/trainable': 1098435968, 'epoch': '2.682'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 1566/1751 [26:09:47<3:07:26, 60.79s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 1567/1751 [26:10:46<3:04:32, 60.18s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5257', 'grad_norm': '0.1875', 'learning_rate': '6.038e-07', 'ppl': '1.692', 'memory/max_active (GiB)': '70.97', 'memory/max_allocated (GiB)': '70.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '145.2', 'tokens/total': 2972857088, 'tokens/trainable': 1099096192, 'epoch': '2.684'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 1567/1751 [26:10:46<3:04:32, 60.18s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 1568/1751 [26:11:47<3:03:53, 60.29s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5586', 'grad_norm': '0.1855', 'learning_rate': '5.973e-07', 'ppl': '1.748', 'memory/max_active (GiB)': '74.63', 'memory/max_allocated (GiB)': '74.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '92.05', 'tokens/total': 2974780928, 'tokens/trainable': 1099795968, 'epoch': '2.685'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 1568/1751 [26:11:47<3:03:53, 60.29s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 1569/1751 [26:12:47<3:03:11, 60.39s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5075', 'grad_norm': '0.1729', 'learning_rate': '5.909e-07', 'ppl': '1.661', 'memory/max_active (GiB)': '75.8', 'memory/max_allocated (GiB)': '75.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.22', 'tokens/total': 2976684544, 'tokens/trainable': 1100490624, 'epoch': '2.687'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 1569/1751 [26:12:47<3:03:11, 60.39s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 1570/1751 [26:13:50<3:04:17, 61.09s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4777', 'grad_norm': '0.1621', 'learning_rate': '5.846e-07', 'ppl': '1.612', 'memory/max_active (GiB)': '75.97', 'memory/max_allocated (GiB)': '75.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '116.2', 'tokens/total': 2978712320, 'tokens/trainable': 1101243392, 'epoch': '2.689'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 1570/1751 [26:13:50<3:04:17, 61.09s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 1571/1751 [26:14:52<3:03:44, 61.25s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4936', 'grad_norm': '0.1807', 'learning_rate': '5.782e-07', 'ppl': '1.638', 'memory/max_active (GiB)': '75.78', 'memory/max_allocated (GiB)': '75.78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '142.3', 'tokens/total': 2980666624, 'tokens/trainable': 1101959296, 'epoch': '2.691'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 1571/1751 [26:14:52<3:03:44, 61.25s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 1572/1751 [26:15:50<3:00:08, 60.38s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5694', 'grad_norm': '0.1807', 'learning_rate': '5.719e-07', 'ppl': '1.767', 'memory/max_active (GiB)': '68.07', 'memory/max_allocated (GiB)': '68.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.33', 'tokens/total': 2982481408, 'tokens/trainable': 1102676992, 'epoch': '2.692'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 1572/1751 [26:15:50<3:00:08, 60.38s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 1573/1751 [26:16:50<2:59:16, 60.43s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5307', 'grad_norm': '0.1816', 'learning_rate': '5.656e-07', 'ppl': '1.7', 'memory/max_active (GiB)': '74.98', 'memory/max_allocated (GiB)': '74.98', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '155.7', 'tokens/total': 2984387072, 'tokens/trainable': 1103374976, 'epoch': '2.694'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 1573/1751 [26:16:50<2:59:16, 60.43s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 1574/1751 [26:17:48<2:56:00, 59.66s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5341', 'grad_norm': '0.1895', 'learning_rate': '5.594e-07', 'ppl': '1.706', 'memory/max_active (GiB)': '73.34', 'memory/max_allocated (GiB)': '73.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '9.511', 'tokens/total': 2986201088, 'tokens/trainable': 1104043520, 'epoch': '2.696'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 1574/1751 [26:17:48<2:56:00, 59.66s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 1575/1751 [26:18:48<2:55:18, 59.76s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5236', 'grad_norm': '0.1816', 'learning_rate': '5.532e-07', 'ppl': '1.688', 'memory/max_active (GiB)': '71.45', 'memory/max_allocated (GiB)': '71.45', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107', 'tokens/total': 2988097280, 'tokens/trainable': 1104748160, 'epoch': '2.697'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 1575/1751 [26:18:48<2:55:18, 59.76s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 1576/1751 [26:19:49<2:55:27, 60.16s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5034', 'grad_norm': '0.1582', 'learning_rate': '5.47e-07', 'ppl': '1.654', 'memory/max_active (GiB)': '76.22', 'memory/max_allocated (GiB)': '76.22', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '77.54', 'tokens/total': 2990048768, 'tokens/trainable': 1105488896, 'epoch': '2.699'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 1576/1751 [26:19:49<2:55:27, 60.16s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 1577/1751 [26:20:50<2:54:37, 60.22s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5038', 'grad_norm': '0.1768', 'learning_rate': '5.409e-07', 'ppl': '1.655', 'memory/max_active (GiB)': '73.04', 'memory/max_allocated (GiB)': '73.04', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.62', 'tokens/total': 2991963392, 'tokens/trainable': 1106167680, 'epoch': '2.701'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 1577/1751 [26:20:50<2:54:37, 60.22s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 1578/1751 [26:21:51<2:54:54, 60.66s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5248', 'grad_norm': '0.168', 'learning_rate': '5.348e-07', 'ppl': '1.69', 'memory/max_active (GiB)': '71.9', 'memory/max_allocated (GiB)': '71.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.95', 'tokens/total': 2993936384, 'tokens/trainable': 1106906240, 'epoch': '2.703'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 1578/1751 [26:21:51<2:54:54, 60.66s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 1579/1751 [26:22:52<2:53:56, 60.68s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4944', 'grad_norm': '0.1738', 'learning_rate': '5.287e-07', 'ppl': '1.64', 'memory/max_active (GiB)': '72.83', 'memory/max_allocated (GiB)': '72.83', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111.6', 'tokens/total': 2995879168, 'tokens/trainable': 1107645696, 'epoch': '2.704'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 1579/1751 [26:22:52<2:53:56, 60.68s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 1580/1751 [26:23:51<2:51:28, 60.17s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.535', 'grad_norm': '0.1826', 'learning_rate': '5.226e-07', 'ppl': '1.708', 'memory/max_active (GiB)': '74.86', 'memory/max_allocated (GiB)': '74.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.82', 'tokens/total': 2997716224, 'tokens/trainable': 1108321792, 'epoch': '2.706'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 1580/1751 [26:23:51<2:51:28, 60.17s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 1581/1751 [26:24:52<2:50:50, 60.30s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.519', 'grad_norm': '0.1768', 'learning_rate': '5.166e-07', 'ppl': '1.68', 'memory/max_active (GiB)': '74.65', 'memory/max_allocated (GiB)': '74.65', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '119.3', 'tokens/total': 2999588864, 'tokens/trainable': 1109038592, 'epoch': '2.708'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 1581/1751 [26:24:52<2:50:50, 60.30s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 1582/1751 [26:25:54<2:51:48, 61.00s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4854', 'grad_norm': '0.1689', 'learning_rate': '5.107e-07', 'ppl': '1.625', 'memory/max_active (GiB)': '71.97', 'memory/max_allocated (GiB)': '71.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '158.9', 'tokens/total': 3001562624, 'tokens/trainable': 1109749632, 'epoch': '2.709'}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 1582/1751 [26:25:54<2:51:48, 61.00s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 1583/1751 [26:26:57<2:52:22, 61.57s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.496', 'grad_norm': '0.168', 'learning_rate': '5.047e-07', 'ppl': '1.642', 'memory/max_active (GiB)': '72.09', 'memory/max_allocated (GiB)': '72.09', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.96', 'tokens/total': 3003607040, 'tokens/trainable': 1110509568, 'epoch': '2.711'}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 1583/1751 [26:26:57<2:52:22, 61.57s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 1584/1751 [26:27:56<2:48:54, 60.68s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5213', 'grad_norm': '0.1699', 'learning_rate': '4.988e-07', 'ppl': '1.684', 'memory/max_active (GiB)': '72.88', 'memory/max_allocated (GiB)': '72.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.54', 'tokens/total': 3005463296, 'tokens/trainable': 1111218432, 'epoch': '2.713'}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 1584/1751 [26:27:56<2:48:54, 60.68s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 1585/1751 [26:28:55<2:46:45, 60.28s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5326', 'grad_norm': '0.1836', 'learning_rate': '4.929e-07', 'ppl': '1.703', 'memory/max_active (GiB)': '74.17', 'memory/max_allocated (GiB)': '74.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107.9', 'tokens/total': 3007332352, 'tokens/trainable': 1111926272, 'epoch': '2.715'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 1585/1751 [26:28:55<2:46:45, 60.28s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 1586/1751 [26:29:56<2:46:30, 60.55s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5241', 'grad_norm': '0.1797', 'learning_rate': '4.871e-07', 'ppl': '1.689', 'memory/max_active (GiB)': '76.6', 'memory/max_allocated (GiB)': '76.6', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '112.6', 'tokens/total': 3009267712, 'tokens/trainable': 1112633344, 'epoch': '2.716'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 1586/1751 [26:29:56<2:46:30, 60.55s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 1587/1751 [26:30:57<2:45:47, 60.66s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5095', 'grad_norm': '0.1846', 'learning_rate': '4.813e-07', 'ppl': '1.664', 'memory/max_active (GiB)': '75.14', 'memory/max_allocated (GiB)': '75.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '10.3', 'tokens/total': 3011187200, 'tokens/trainable': 1113372288, 'epoch': '2.718'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 1587/1751 [26:30:57<2:45:47, 60.66s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 1588/1751 [26:31:56<2:43:03, 60.02s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5327', 'grad_norm': '0.1816', 'learning_rate': '4.755e-07', 'ppl': '1.704', 'memory/max_active (GiB)': '74.05', 'memory/max_allocated (GiB)': '74.05', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '74.57', 'tokens/total': 3013033728, 'tokens/trainable': 1114069888, 'epoch': '2.72'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 1588/1751 [26:31:56<2:43:03, 60.02s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 1589/1751 [26:32:57<2:43:19, 60.49s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4561', 'grad_norm': '0.167', 'learning_rate': '4.698e-07', 'ppl': '1.578', 'memory/max_active (GiB)': '75.79', 'memory/max_allocated (GiB)': '75.79', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '147.5', 'tokens/total': 3014998272, 'tokens/trainable': 1114782080, 'epoch': '2.721'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 1589/1751 [26:32:57<2:43:19, 60.49s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 1590/1751 [26:33:56<2:40:27, 59.80s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5503', 'grad_norm': '0.1885', 'learning_rate': '4.641e-07', 'ppl': '1.734', 'memory/max_active (GiB)': '69.43', 'memory/max_allocated (GiB)': '69.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '67.25', 'tokens/total': 3016866560, 'tokens/trainable': 1115482752, 'epoch': '2.723'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                        | 1590/1751 [26:33:56<2:40:27, 59.80s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 1591/1751 [26:34:57<2:40:40, 60.25s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5115', 'grad_norm': '0.1709', 'learning_rate': '4.584e-07', 'ppl': '1.668', 'memory/max_active (GiB)': '73.21', 'memory/max_allocated (GiB)': '73.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '151.5', 'tokens/total': 3018814720, 'tokens/trainable': 1116236416, 'epoch': '2.725'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 1591/1751 [26:34:57<2:40:40, 60.25s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 1592/1751 [26:35:56<2:38:54, 59.96s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5537', 'grad_norm': '0.1914', 'learning_rate': '4.528e-07', 'ppl': '1.74', 'memory/max_active (GiB)': '74.81', 'memory/max_allocated (GiB)': '74.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '136.3', 'tokens/total': 3020729344, 'tokens/trainable': 1116904832, 'epoch': '2.727'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 1592/1751 [26:35:56<2:38:54, 59.96s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 1593/1751 [26:36:54<2:36:15, 59.34s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5434', 'grad_norm': '0.1826', 'learning_rate': '4.472e-07', 'ppl': '1.722', 'memory/max_active (GiB)': '77.15', 'memory/max_allocated (GiB)': '77.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '100.5', 'tokens/total': 3022543104, 'tokens/trainable': 1117580672, 'epoch': '2.728'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 1593/1751 [26:36:54<2:36:15, 59.34s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 1594/1751 [26:37:54<2:35:57, 59.60s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5582', 'grad_norm': '0.1846', 'learning_rate': '4.416e-07', 'ppl': '1.747', 'memory/max_active (GiB)': '76.72', 'memory/max_allocated (GiB)': '76.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.46', 'tokens/total': 3024444416, 'tokens/trainable': 1118285696, 'epoch': '2.73'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 1594/1751 [26:37:54<2:35:57, 59.60s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                        | 1595/1751 [26:38:54<2:35:11, 59.69s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5385', 'grad_norm': '0.1895', 'learning_rate': '4.361e-07', 'ppl': '1.713', 'memory/max_active (GiB)': '76.53', 'memory/max_allocated (GiB)': '76.53', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '144.4', 'tokens/total': 3026370304, 'tokens/trainable': 1118971392, 'epoch': '2.732'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                        | 1595/1751 [26:38:54<2:35:11, 59.69s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 1596/1751 [26:39:54<2:34:32, 59.82s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5269', 'grad_norm': '0.1816', 'learning_rate': '4.306e-07', 'ppl': '1.694', 'memory/max_active (GiB)': '72.76', 'memory/max_allocated (GiB)': '72.76', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '212.9', 'tokens/total': 3028316672, 'tokens/trainable': 1119686144, 'epoch': '2.733'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 1596/1751 [26:39:54<2:34:32, 59.82s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 1597/1751 [26:40:58<2:36:43, 61.06s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.484', 'grad_norm': '0.1592', 'learning_rate': '4.251e-07', 'ppl': '1.623', 'memory/max_active (GiB)': '75.83', 'memory/max_allocated (GiB)': '75.83', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '97.67', 'tokens/total': 3030396928, 'tokens/trainable': 1120474240, 'epoch': '2.735'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 1597/1751 [26:40:58<2:36:43, 61.06s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 1598/1751 [26:42:00<2:36:16, 61.29s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4824', 'grad_norm': '0.1738', 'learning_rate': '4.197e-07', 'ppl': '1.62', 'memory/max_active (GiB)': '75.78', 'memory/max_allocated (GiB)': '75.78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '111.1', 'tokens/total': 3032359168, 'tokens/trainable': 1121198464, 'epoch': '2.737'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 1598/1751 [26:42:00<2:36:16, 61.29s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 1599/1751 [26:43:00<2:33:57, 60.77s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4867', 'grad_norm': '0.165', 'learning_rate': '4.143e-07', 'ppl': '1.627', 'memory/max_active (GiB)': '70.57', 'memory/max_allocated (GiB)': '70.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '135.9', 'tokens/total': 3034279424, 'tokens/trainable': 1121926144, 'epoch': '2.739'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 1599/1751 [26:43:00<2:33:57, 60.77s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 1600/1751 [26:43:59<2:31:34, 60.23s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4997', 'grad_norm': '0.1748', 'learning_rate': '4.089e-07', 'ppl': '1.648', 'memory/max_active (GiB)': '69.05', 'memory/max_allocated (GiB)': '69.05', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '67.67', 'tokens/total': 3036179456, 'tokens/trainable': 1122627072, 'epoch': '2.74'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 1600/1751 [26:43:59<2:31:34, 60.23s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 1601/1751 [26:44:57<2:29:10, 59.67s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5452', 'grad_norm': '0.1846', 'learning_rate': '4.036e-07', 'ppl': '1.725', 'memory/max_active (GiB)': '73.49', 'memory/max_allocated (GiB)': '73.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.1', 'tokens/total': 3037980672, 'tokens/trainable': 1123287424, 'epoch': '2.742'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 1601/1751 [26:44:57<2:29:10, 59.67s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 1602/1751 [26:45:58<2:29:24, 60.16s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5133', 'grad_norm': '0.1719', 'learning_rate': '3.983e-07', 'ppl': '1.671', 'memory/max_active (GiB)': '75.54', 'memory/max_allocated (GiB)': '75.54', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '36.4', 'tokens/total': 3039934976, 'tokens/trainable': 1124041344, 'epoch': '2.744'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 1602/1751 [26:45:58<2:29:24, 60.16s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 1603/1751 [26:46:59<2:28:35, 60.24s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4861', 'grad_norm': '0.1748', 'learning_rate': '3.931e-07', 'ppl': '1.626', 'memory/max_active (GiB)': '74.24', 'memory/max_allocated (GiB)': '74.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '110.6', 'tokens/total': 3041819904, 'tokens/trainable': 1124760832, 'epoch': '2.745'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 1603/1751 [26:46:59<2:28:35, 60.24s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 1604/1751 [26:48:02<2:29:26, 61.00s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4847', 'grad_norm': '0.1719', 'learning_rate': '3.878e-07', 'ppl': '1.624', 'memory/max_active (GiB)': '74.18', 'memory/max_allocated (GiB)': '74.18', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '59.46', 'tokens/total': 3043826176, 'tokens/trainable': 1125495424, 'epoch': '2.747'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 1604/1751 [26:48:02<2:29:26, 61.00s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 1605/1751 [26:49:02<2:27:48, 60.74s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4967', 'grad_norm': '0.1689', 'learning_rate': '3.827e-07', 'ppl': '1.643', 'memory/max_active (GiB)': '72.88', 'memory/max_allocated (GiB)': '72.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '161.8', 'tokens/total': 3045728768, 'tokens/trainable': 1126208384, 'epoch': '2.749'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 1605/1751 [26:49:02<2:27:48, 60.74s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 1606/1751 [26:50:03<2:27:05, 60.86s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5037', 'grad_norm': '0.1699', 'learning_rate': '3.775e-07', 'ppl': '1.655', 'memory/max_active (GiB)': '74.93', 'memory/max_allocated (GiB)': '74.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '54.69', 'tokens/total': 3047672832, 'tokens/trainable': 1126946048, 'epoch': '2.751'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 1606/1751 [26:50:03<2:27:05, 60.86s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 1607/1751 [26:51:02<2:24:51, 60.36s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5109', 'grad_norm': '0.1816', 'learning_rate': '3.724e-07', 'ppl': '1.667', 'memory/max_active (GiB)': '71.52', 'memory/max_allocated (GiB)': '71.52', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '67.31', 'tokens/total': 3049534720, 'tokens/trainable': 1127629440, 'epoch': '2.752'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 1607/1751 [26:51:02<2:24:51, 60.36s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 1608/1751 [26:52:02<2:23:40, 60.28s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5084', 'grad_norm': '0.1826', 'learning_rate': '3.673e-07', 'ppl': '1.663', 'memory/max_active (GiB)': '74.69', 'memory/max_allocated (GiB)': '74.69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.45', 'tokens/total': 3051403776, 'tokens/trainable': 1128296832, 'epoch': '2.754'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 1608/1751 [26:52:02<2:23:40, 60.28s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 1609/1751 [26:53:05<2:24:11, 60.93s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4781', 'grad_norm': '0.1719', 'learning_rate': '3.622e-07', 'ppl': '1.613', 'memory/max_active (GiB)': '71.72', 'memory/max_allocated (GiB)': '71.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '67.04', 'tokens/total': 3053391872, 'tokens/trainable': 1129029888, 'epoch': '2.756'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 1609/1751 [26:53:05<2:24:11, 60.93s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 1610/1751 [26:54:07<2:24:07, 61.33s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4969', 'grad_norm': '0.1758', 'learning_rate': '3.572e-07', 'ppl': '1.644', 'memory/max_active (GiB)': '72.28', 'memory/max_allocated (GiB)': '72.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '128', 'tokens/total': 3055376896, 'tokens/trainable': 1129742080, 'epoch': '2.757'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 1610/1751 [26:54:07<2:24:07, 61.33s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 1611/1751 [26:55:09<2:23:33, 61.52s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.524', 'grad_norm': '0.1748', 'learning_rate': '3.522e-07', 'ppl': '1.689', 'memory/max_active (GiB)': '76.64', 'memory/max_allocated (GiB)': '76.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '146.4', 'tokens/total': 3057346560, 'tokens/trainable': 1130475264, 'epoch': '2.759'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 1611/1751 [26:55:09<2:23:33, 61.52s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 1612/1751 [26:56:06<2:19:46, 60.34s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5367', 'grad_norm': '0.1826', 'learning_rate': '3.473e-07', 'ppl': '1.71', 'memory/max_active (GiB)': '73.99', 'memory/max_allocated (GiB)': '73.99', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '78.33', 'tokens/total': 3059136256, 'tokens/trainable': 1131168000, 'epoch': '2.761'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 1612/1751 [26:56:06<2:19:46, 60.34s/it] 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 1613/1751 [26:57:07<2:19:19, 60.57s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5136', 'grad_norm': '0.1631', 'learning_rate': '3.424e-07', 'ppl': '1.671', 'memory/max_active (GiB)': '71.73', 'memory/max_allocated (GiB)': '71.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.03', 'tokens/total': 3061074432, 'tokens/trainable': 1131910656, 'epoch': '2.763'}
 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 1613/1751 [26:57:07<2:19:19, 60.57s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 1614/1751 [26:58:08<2:18:18, 60.57s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4853', 'grad_norm': '0.1709', 'learning_rate': '3.375e-07', 'ppl': '1.625', 'memory/max_active (GiB)': '74.05', 'memory/max_allocated (GiB)': '74.05', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '55.55', 'tokens/total': 3062998016, 'tokens/trainable': 1132630272, 'epoch': '2.764'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 1614/1751 [26:58:08<2:18:18, 60.57s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 1615/1751 [26:59:07<2:16:20, 60.15s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5606', 'grad_norm': '0.1904', 'learning_rate': '3.326e-07', 'ppl': '1.752', 'memory/max_active (GiB)': '71.98', 'memory/max_allocated (GiB)': '71.98', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '129.9', 'tokens/total': 3064827648, 'tokens/trainable': 1133264640, 'epoch': '2.766'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 1615/1751 [26:59:07<2:16:20, 60.15s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 1616/1751 [27:00:08<2:15:40, 60.30s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5109', 'grad_norm': '0.1787', 'learning_rate': '3.278e-07', 'ppl': '1.667', 'memory/max_active (GiB)': '72.41', 'memory/max_allocated (GiB)': '72.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '161', 'tokens/total': 3066785536, 'tokens/trainable': 1133988480, 'epoch': '2.768'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 1616/1751 [27:00:08<2:15:40, 60.30s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 1617/1751 [27:01:08<2:14:40, 60.31s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5171', 'grad_norm': '0.1719', 'learning_rate': '3.231e-07', 'ppl': '1.677', 'memory/max_active (GiB)': '71.72', 'memory/max_allocated (GiB)': '71.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '100.3', 'tokens/total': 3068646144, 'tokens/trainable': 1134680192, 'epoch': '2.769'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 1617/1751 [27:01:08<2:14:40, 60.31s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 1618/1751 [27:02:07<2:12:48, 59.92s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5196', 'grad_norm': '0.1738', 'learning_rate': '3.183e-07', 'ppl': '1.681', 'memory/max_active (GiB)': '67.73', 'memory/max_allocated (GiB)': '67.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '145.3', 'tokens/total': 3070535680, 'tokens/trainable': 1135375616, 'epoch': '2.771'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 1618/1751 [27:02:07<2:12:48, 59.92s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 1619/1751 [27:03:07<2:11:49, 59.92s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5159', 'grad_norm': '0.1943', 'learning_rate': '3.136e-07', 'ppl': '1.675', 'memory/max_active (GiB)': '77.46', 'memory/max_allocated (GiB)': '77.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '35.25', 'tokens/total': 3072431872, 'tokens/trainable': 1136067328, 'epoch': '2.773'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 1619/1751 [27:03:07<2:11:49, 59.92s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 1620/1751 [27:04:08<2:11:28, 60.22s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5302', 'grad_norm': '0.1768', 'learning_rate': '3.089e-07', 'ppl': '1.699', 'memory/max_active (GiB)': '74.75', 'memory/max_allocated (GiB)': '74.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '172.6', 'tokens/total': 3074314496, 'tokens/trainable': 1136775680, 'epoch': '2.775'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 1620/1751 [27:04:08<2:11:28, 60.22s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 1621/1751 [27:05:08<2:10:38, 60.29s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5236', 'grad_norm': '0.1816', 'learning_rate': '3.043e-07', 'ppl': '1.688', 'memory/max_active (GiB)': '66.93', 'memory/max_allocated (GiB)': '66.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '147.2', 'tokens/total': 3076193024, 'tokens/trainable': 1137477888, 'epoch': '2.776'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                    | 1621/1751 [27:05:09<2:10:38, 60.29s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 1622/1751 [27:06:08<2:09:09, 60.08s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5268', 'grad_norm': '0.1885', 'learning_rate': '2.997e-07', 'ppl': '1.694', 'memory/max_active (GiB)': '74.88', 'memory/max_allocated (GiB)': '74.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.63', 'tokens/total': 3078051072, 'tokens/trainable': 1138165504, 'epoch': '2.778'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 1622/1751 [27:06:08<2:09:09, 60.08s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 1623/1751 [27:07:05<2:06:09, 59.14s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5578', 'grad_norm': '0.1963', 'learning_rate': '2.951e-07', 'ppl': '1.747', 'memory/max_active (GiB)': '74.77', 'memory/max_allocated (GiB)': '74.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.63', 'tokens/total': 3079798016, 'tokens/trainable': 1138789888, 'epoch': '2.78'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 1623/1751 [27:07:05<2:06:09, 59.14s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 1624/1751 [27:08:04<2:04:54, 59.01s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5133', 'grad_norm': '0.1758', 'learning_rate': '2.906e-07', 'ppl': '1.671', 'memory/max_active (GiB)': '73.43', 'memory/max_allocated (GiB)': '73.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '42.79', 'tokens/total': 3081669888, 'tokens/trainable': 1139462016, 'epoch': '2.781'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 1624/1751 [27:08:04<2:04:54, 59.01s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 1625/1751 [27:09:06<2:05:53, 59.95s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5232', 'grad_norm': '0.1758', 'learning_rate': '2.861e-07', 'ppl': '1.687', 'memory/max_active (GiB)': '75.46', 'memory/max_allocated (GiB)': '75.46', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '135.9', 'tokens/total': 3083649024, 'tokens/trainable': 1140167296, 'epoch': '2.783'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 1625/1751 [27:09:06<2:05:53, 59.95s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 1626/1751 [27:10:09<2:06:35, 60.76s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4891', 'grad_norm': '0.1689', 'learning_rate': '2.816e-07', 'ppl': '1.631', 'memory/max_active (GiB)': '76.27', 'memory/max_allocated (GiB)': '76.27', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '142.1', 'tokens/total': 3085597952, 'tokens/trainable': 1140903552, 'epoch': '2.785'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 1626/1751 [27:10:09<2:06:35, 60.76s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 1627/1751 [27:11:07<2:04:05, 60.05s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5165', 'grad_norm': '0.1768', 'learning_rate': '2.772e-07', 'ppl': '1.676', 'memory/max_active (GiB)': '78', 'memory/max_allocated (GiB)': '78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '21.14', 'tokens/total': 3087455744, 'tokens/trainable': 1141592448, 'epoch': '2.787'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 1627/1751 [27:11:07<2:04:05, 60.05s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 1628/1751 [27:12:05<2:01:41, 59.36s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5561', 'grad_norm': '0.1924', 'learning_rate': '2.728e-07', 'ppl': '1.744', 'memory/max_active (GiB)': '72.56', 'memory/max_allocated (GiB)': '72.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '125.3', 'tokens/total': 3089297408, 'tokens/trainable': 1142244096, 'epoch': '2.788'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 1628/1751 [27:12:05<2:01:41, 59.36s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 1629/1751 [27:13:07<2:02:14, 60.12s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5171', 'grad_norm': '0.1758', 'learning_rate': '2.684e-07', 'ppl': '1.677', 'memory/max_active (GiB)': '71.45', 'memory/max_allocated (GiB)': '71.45', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '89.88', 'tokens/total': 3091260416, 'tokens/trainable': 1142987520, 'epoch': '2.79'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 1629/1751 [27:13:07<2:02:14, 60.12s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 1630/1751 [27:14:06<2:00:48, 59.91s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5202', 'grad_norm': '0.1855', 'learning_rate': '2.641e-07', 'ppl': '1.682', 'memory/max_active (GiB)': '75.15', 'memory/max_allocated (GiB)': '75.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '67.35', 'tokens/total': 3093129728, 'tokens/trainable': 1143665152, 'epoch': '2.792'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 1630/1751 [27:14:06<2:00:48, 59.91s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 1631/1751 [27:15:04<1:58:33, 59.28s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5539', 'grad_norm': '0.1963', 'learning_rate': '2.598e-07', 'ppl': '1.74', 'memory/max_active (GiB)': '74.38', 'memory/max_allocated (GiB)': '74.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.31', 'tokens/total': 3094962944, 'tokens/trainable': 1144320512, 'epoch': '2.793'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                  | 1631/1751 [27:15:04<1:58:33, 59.28s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 1632/1751 [27:16:03<1:57:32, 59.26s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5007', 'grad_norm': '0.1748', 'learning_rate': '2.555e-07', 'ppl': '1.65', 'memory/max_active (GiB)': '75.57', 'memory/max_allocated (GiB)': '75.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.2', 'tokens/total': 3096840704, 'tokens/trainable': 1145033600, 'epoch': '2.795'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 1632/1751 [27:16:03<1:57:32, 59.26s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 1633/1751 [27:17:01<1:55:57, 58.96s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5132', 'grad_norm': '0.1943', 'learning_rate': '2.513e-07', 'ppl': '1.671', 'memory/max_active (GiB)': '75.44', 'memory/max_allocated (GiB)': '75.44', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '25.73', 'tokens/total': 3098671616, 'tokens/trainable': 1145693568, 'epoch': '2.797'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 1633/1751 [27:17:01<1:55:57, 58.96s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 1634/1751 [27:18:02<1:55:50, 59.41s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5216', 'grad_norm': '0.1738', 'learning_rate': '2.471e-07', 'ppl': '1.685', 'memory/max_active (GiB)': '73.26', 'memory/max_allocated (GiB)': '73.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '29.16', 'tokens/total': 3100541440, 'tokens/trainable': 1146419456, 'epoch': '2.799'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 1634/1751 [27:18:02<1:55:50, 59.41s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 1635/1751 [27:19:01<1:54:56, 59.45s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5424', 'grad_norm': '0.1777', 'learning_rate': '2.43e-07', 'ppl': '1.72', 'memory/max_active (GiB)': '75.14', 'memory/max_allocated (GiB)': '75.14', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '140.5', 'tokens/total': 3102429440, 'tokens/trainable': 1147157248, 'epoch': '2.8'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 1635/1751 [27:19:01<1:54:56, 59.45s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 1636/1751 [27:20:01<1:54:02, 59.50s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5463', 'grad_norm': '0.1865', 'learning_rate': '2.389e-07', 'ppl': '1.727', 'memory/max_active (GiB)': '73.25', 'memory/max_allocated (GiB)': '73.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '167.7', 'tokens/total': 3104304640, 'tokens/trainable': 1147861120, 'epoch': '2.802'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 1636/1751 [27:20:01<1:54:02, 59.50s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 1637/1751 [27:21:00<1:52:59, 59.47s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5246', 'grad_norm': '0.1816', 'learning_rate': '2.348e-07', 'ppl': '1.69', 'memory/max_active (GiB)': '68.39', 'memory/max_allocated (GiB)': '68.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '148.4', 'tokens/total': 3106168832, 'tokens/trainable': 1148533376, 'epoch': '2.804'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 1637/1751 [27:21:00<1:52:59, 59.47s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 1638/1751 [27:22:02<1:53:07, 60.07s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4904', 'grad_norm': '0.1719', 'learning_rate': '2.307e-07', 'ppl': '1.633', 'memory/max_active (GiB)': '73.71', 'memory/max_allocated (GiB)': '73.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '115.3', 'tokens/total': 3108083712, 'tokens/trainable': 1149259392, 'epoch': '2.805'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 1638/1751 [27:22:02<1:53:07, 60.07s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 1639/1751 [27:23:03<1:52:38, 60.34s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4975', 'grad_norm': '0.165', 'learning_rate': '2.267e-07', 'ppl': '1.645', 'memory/max_active (GiB)': '75.2', 'memory/max_allocated (GiB)': '75.2', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '189.8', 'tokens/total': 3110049792, 'tokens/trainable': 1150011520, 'epoch': '2.807'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 1639/1751 [27:23:03<1:52:38, 60.34s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 1640/1751 [27:24:03<1:51:28, 60.26s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5239', 'grad_norm': '0.1826', 'learning_rate': '2.227e-07', 'ppl': '1.689', 'memory/max_active (GiB)': '67.93', 'memory/max_allocated (GiB)': '67.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '94.33', 'tokens/total': 3111930624, 'tokens/trainable': 1150689152, 'epoch': '2.809'}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                 | 1640/1751 [27:24:03<1:51:28, 60.26s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 1641/1751 [27:25:06<1:51:54, 61.04s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5174', 'grad_norm': '0.1758', 'learning_rate': '2.188e-07', 'ppl': '1.678', 'memory/max_active (GiB)': '74.81', 'memory/max_allocated (GiB)': '74.81', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '47.04', 'tokens/total': 3113896704, 'tokens/trainable': 1151390592, 'epoch': '2.811'}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 1641/1751 [27:25:06<1:51:54, 61.04s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 1642/1751 [27:26:09<1:52:08, 61.73s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4593', 'grad_norm': '0.1582', 'learning_rate': '2.149e-07', 'ppl': '1.583', 'memory/max_active (GiB)': '74.49', 'memory/max_allocated (GiB)': '74.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107', 'tokens/total': 3115933696, 'tokens/trainable': 1152158720, 'epoch': '2.812'}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 1642/1751 [27:26:09<1:52:08, 61.73s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 1643/1751 [27:27:10<1:50:30, 61.39s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5163', 'grad_norm': '0.1777', 'learning_rate': '2.11e-07', 'ppl': '1.676', 'memory/max_active (GiB)': '73.75', 'memory/max_allocated (GiB)': '73.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '35.3', 'tokens/total': 3117811456, 'tokens/trainable': 1152871552, 'epoch': '2.814'}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 1643/1751 [27:27:10<1:50:30, 61.39s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 1644/1751 [27:28:10<1:49:00, 61.13s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5182', 'grad_norm': '0.1689', 'learning_rate': '2.072e-07', 'ppl': '1.679', 'memory/max_active (GiB)': '72.35', 'memory/max_allocated (GiB)': '72.35', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.27', 'tokens/total': 3119693568, 'tokens/trainable': 1153603840, 'epoch': '2.816'}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 1644/1751 [27:28:10<1:49:00, 61.13s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 1645/1751 [27:29:12<1:48:14, 61.27s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4797', 'grad_norm': '0.1787', 'learning_rate': '2.034e-07', 'ppl': '1.616', 'memory/max_active (GiB)': '77.97', 'memory/max_allocated (GiB)': '77.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '75.19', 'tokens/total': 3121638400, 'tokens/trainable': 1154349440, 'epoch': '2.817'}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 1645/1751 [27:29:12<1:48:14, 61.27s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 1646/1751 [27:30:12<1:46:49, 61.05s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5011', 'grad_norm': '0.1729', 'learning_rate': '1.996e-07', 'ppl': '1.65', 'memory/max_active (GiB)': '74.23', 'memory/max_allocated (GiB)': '74.23', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '83.07', 'tokens/total': 3123546880, 'tokens/trainable': 1155041280, 'epoch': '2.819'}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 1646/1751 [27:30:12<1:46:49, 61.05s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 1647/1751 [27:31:12<1:45:20, 60.78s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5345', 'grad_norm': '0.1826', 'learning_rate': '1.958e-07', 'ppl': '1.707', 'memory/max_active (GiB)': '73.61', 'memory/max_allocated (GiB)': '73.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '71.64', 'tokens/total': 3125422592, 'tokens/trainable': 1155711104, 'epoch': '2.821'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 1647/1751 [27:31:12<1:45:20, 60.78s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 1648/1751 [27:32:10<1:42:44, 59.85s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5451', 'grad_norm': '0.1816', 'learning_rate': '1.921e-07', 'ppl': '1.725', 'memory/max_active (GiB)': '70.67', 'memory/max_allocated (GiB)': '70.67', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '156.6', 'tokens/total': 3127178496, 'tokens/trainable': 1156376704, 'epoch': '2.823'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 1648/1751 [27:32:10<1:42:44, 59.85s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 1649/1751 [27:33:11<1:42:23, 60.23s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5046', 'grad_norm': '0.1699', 'learning_rate': '1.885e-07', 'ppl': '1.656', 'memory/max_active (GiB)': '72.11', 'memory/max_allocated (GiB)': '72.11', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.43', 'tokens/total': 3129112064, 'tokens/trainable': 1157096320, 'epoch': '2.824'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 1649/1751 [27:33:11<1:42:23, 60.23s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 1650/1751 [27:34:10<1:40:35, 59.75s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.6084', 'grad_norm': '0.209', 'learning_rate': '1.849e-07', 'ppl': '1.838', 'memory/max_active (GiB)': '74.92', 'memory/max_allocated (GiB)': '74.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '76.66', 'tokens/total': 3130891264, 'tokens/trainable': 1157737856, 'epoch': '2.826'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 1650/1751 [27:34:10<1:40:35, 59.75s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 1651/1751 [27:35:12<1:40:35, 60.36s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5354', 'grad_norm': '0.1807', 'learning_rate': '1.813e-07', 'ppl': '1.708', 'memory/max_active (GiB)': '76.86', 'memory/max_allocated (GiB)': '76.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '133.6', 'tokens/total': 3132840192, 'tokens/trainable': 1158448640, 'epoch': '2.828'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 1651/1751 [27:35:12<1:40:35, 60.36s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 1652/1751 [27:36:12<1:39:27, 60.28s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4884', 'grad_norm': '0.1611', 'learning_rate': '1.777e-07', 'ppl': '1.63', 'memory/max_active (GiB)': '71.92', 'memory/max_allocated (GiB)': '71.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '68.39', 'tokens/total': 3134760704, 'tokens/trainable': 1159190016, 'epoch': '2.829'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 1652/1751 [27:36:12<1:39:27, 60.28s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 1653/1751 [27:37:13<1:39:06, 60.68s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5177', 'grad_norm': '0.1738', 'learning_rate': '1.742e-07', 'ppl': '1.678', 'memory/max_active (GiB)': '75.34', 'memory/max_allocated (GiB)': '75.34', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '117.4', 'tokens/total': 3136763904, 'tokens/trainable': 1159933696, 'epoch': '2.831'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 1653/1751 [27:37:13<1:39:06, 60.68s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 1654/1751 [27:38:13<1:37:47, 60.49s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5532', 'grad_norm': '0.1914', 'learning_rate': '1.707e-07', 'ppl': '1.739', 'memory/max_active (GiB)': '71.8', 'memory/max_allocated (GiB)': '71.8', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '91.04', 'tokens/total': 3138676736, 'tokens/trainable': 1160611456, 'epoch': '2.833'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 1654/1751 [27:38:13<1:37:47, 60.49s/it] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 1655/1751 [27:39:12<1:35:58, 59.98s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5132', 'grad_norm': '0.1748', 'learning_rate': '1.672e-07', 'ppl': '1.671', 'memory/max_active (GiB)': '72.38', 'memory/max_allocated (GiB)': '72.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26.07', 'tokens/total': 3140489472, 'tokens/trainable': 1161299072, 'epoch': '2.835'}
 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 1655/1751 [27:39:12<1:35:58, 59.98s/it] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 1656/1751 [27:40:15<1:36:15, 60.79s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4806', 'grad_norm': '0.165', 'learning_rate': '1.638e-07', 'ppl': '1.617', 'memory/max_active (GiB)': '75.93', 'memory/max_allocated (GiB)': '75.93', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '21.18', 'tokens/total': 3142525952, 'tokens/trainable': 1162056832, 'epoch': '2.836'}
 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍              | 1656/1751 [27:40:15<1:36:15, 60.79s/it] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 1657/1751 [27:41:15<1:34:54, 60.57s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5551', 'grad_norm': '0.1768', 'learning_rate': '1.604e-07', 'ppl': '1.742', 'memory/max_active (GiB)': '71.89', 'memory/max_allocated (GiB)': '71.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '49.76', 'tokens/total': 3144392960, 'tokens/trainable': 1162738688, 'epoch': '2.838'}
 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 1657/1751 [27:41:15<1:34:54, 60.57s/it] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 1658/1751 [27:42:15<1:33:49, 60.53s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5111', 'grad_norm': '0.1816', 'learning_rate': '1.571e-07', 'ppl': '1.667', 'memory/max_active (GiB)': '76.52', 'memory/max_allocated (GiB)': '76.52', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '125.2', 'tokens/total': 3146288384, 'tokens/trainable': 1163428480, 'epoch': '2.84'}
 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 1658/1751 [27:42:15<1:33:49, 60.53s/it] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 1659/1751 [27:43:14<1:32:08, 60.09s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5295', 'grad_norm': '0.1895', 'learning_rate': '1.537e-07', 'ppl': '1.698', 'memory/max_active (GiB)': '74.43', 'memory/max_allocated (GiB)': '74.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.72', 'tokens/total': 3148108032, 'tokens/trainable': 1164080128, 'epoch': '2.841'}
 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 1659/1751 [27:43:14<1:32:08, 60.09s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 1660/1751 [27:44:16<1:31:44, 60.48s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4978', 'grad_norm': '0.1689', 'learning_rate': '1.505e-07', 'ppl': '1.645', 'memory/max_active (GiB)': '76.88', 'memory/max_allocated (GiB)': '76.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '78.66', 'tokens/total': 3150038784, 'tokens/trainable': 1164809344, 'epoch': '2.843'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 1660/1751 [27:44:16<1:31:44, 60.48s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 1661/1751 [27:45:15<1:30:11, 60.13s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5209', 'grad_norm': '0.1797', 'learning_rate': '1.472e-07', 'ppl': '1.684', 'memory/max_active (GiB)': '74.65', 'memory/max_allocated (GiB)': '74.65', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.76', 'tokens/total': 3151917824, 'tokens/trainable': 1165510656, 'epoch': '2.845'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 1661/1751 [27:45:15<1:30:11, 60.13s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍             | 1662/1751 [27:46:14<1:28:27, 59.63s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5616', 'grad_norm': '0.1865', 'learning_rate': '1.44e-07', 'ppl': '1.753', 'memory/max_active (GiB)': '71.4', 'memory/max_allocated (GiB)': '71.4', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '45.32', 'tokens/total': 3153695744, 'tokens/trainable': 1166184064, 'epoch': '2.847'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍             | 1662/1751 [27:46:14<1:28:27, 59.63s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 1663/1751 [27:47:14<1:27:43, 59.81s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5552', 'grad_norm': '0.1777', 'learning_rate': '1.408e-07', 'ppl': '1.742', 'memory/max_active (GiB)': '72.71', 'memory/max_allocated (GiB)': '72.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '106.7', 'tokens/total': 3155632640, 'tokens/trainable': 1166879360, 'epoch': '2.848'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 1663/1751 [27:47:14<1:27:43, 59.81s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 1664/1751 [27:48:15<1:27:14, 60.17s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5237', 'grad_norm': '0.1758', 'learning_rate': '1.377e-07', 'ppl': '1.688', 'memory/max_active (GiB)': '76.13', 'memory/max_allocated (GiB)': '76.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '88.37', 'tokens/total': 3157531648, 'tokens/trainable': 1167596416, 'epoch': '2.85'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 1664/1751 [27:48:15<1:27:14, 60.17s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 1665/1751 [27:49:15<1:26:16, 60.20s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5205', 'grad_norm': '0.1689', 'learning_rate': '1.346e-07', 'ppl': '1.683', 'memory/max_active (GiB)': '74.62', 'memory/max_allocated (GiB)': '74.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '99.21', 'tokens/total': 3159438592, 'tokens/trainable': 1168322048, 'epoch': '2.852'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 1665/1751 [27:49:15<1:26:16, 60.20s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 1666/1751 [27:50:13<1:24:22, 59.56s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5761', 'grad_norm': '0.1963', 'learning_rate': '1.315e-07', 'ppl': '1.779', 'memory/max_active (GiB)': '73.5', 'memory/max_allocated (GiB)': '73.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '50.23', 'tokens/total': 3161256192, 'tokens/trainable': 1169003264, 'epoch': '2.853'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 1666/1751 [27:50:13<1:24:22, 59.56s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 1667/1751 [27:51:11<1:22:41, 59.07s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5331', 'grad_norm': '0.1816', 'learning_rate': '1.285e-07', 'ppl': '1.704', 'memory/max_active (GiB)': '76.35', 'memory/max_allocated (GiB)': '76.35', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '73.72', 'tokens/total': 3163082496, 'tokens/trainable': 1169686400, 'epoch': '2.855'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 1667/1751 [27:51:11<1:22:41, 59.07s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 1668/1751 [27:52:12<1:22:28, 59.63s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5184', 'grad_norm': '0.1787', 'learning_rate': '1.255e-07', 'ppl': '1.679', 'memory/max_active (GiB)': '74.15', 'memory/max_allocated (GiB)': '74.15', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '74', 'tokens/total': 3165036544, 'tokens/trainable': 1170404224, 'epoch': '2.857'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 1668/1751 [27:52:12<1:22:28, 59.63s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 1669/1751 [27:53:16<1:23:09, 60.85s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5085', 'grad_norm': '0.1768', 'learning_rate': '1.225e-07', 'ppl': '1.663', 'memory/max_active (GiB)': '76.97', 'memory/max_allocated (GiB)': '76.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '65.37', 'tokens/total': 3167045632, 'tokens/trainable': 1171161216, 'epoch': '2.859'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 1669/1751 [27:53:16<1:23:09, 60.85s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 1670/1751 [27:54:17<1:22:20, 60.99s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5157', 'grad_norm': '0.1748', 'learning_rate': '1.196e-07', 'ppl': '1.675', 'memory/max_active (GiB)': '71.77', 'memory/max_allocated (GiB)': '71.77', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '46.34', 'tokens/total': 3168952576, 'tokens/trainable': 1171869696, 'epoch': '2.86'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌            | 1670/1751 [27:54:17<1:22:20, 60.99s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 1671/1751 [27:55:19<1:21:37, 61.22s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5454', 'grad_norm': '0.1699', 'learning_rate': '1.167e-07', 'ppl': '1.725', 'memory/max_active (GiB)': '76.24', 'memory/max_allocated (GiB)': '76.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '58.37', 'tokens/total': 3170935296, 'tokens/trainable': 1172614144, 'epoch': '2.862'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 1671/1751 [27:55:19<1:21:37, 61.22s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 1672/1751 [27:56:18<1:19:59, 60.76s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5115', 'grad_norm': '0.1689', 'learning_rate': '1.138e-07', 'ppl': '1.668', 'memory/max_active (GiB)': '74.47', 'memory/max_allocated (GiB)': '74.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '170.1', 'tokens/total': 3172804864, 'tokens/trainable': 1173320192, 'epoch': '2.864'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 1672/1751 [27:56:18<1:19:59, 60.76s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 1673/1751 [27:57:17<1:18:11, 60.14s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5303', 'grad_norm': '0.1787', 'learning_rate': '1.11e-07', 'ppl': '1.7', 'memory/max_active (GiB)': '75.06', 'memory/max_allocated (GiB)': '75.06', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '161.5', 'tokens/total': 3174630400, 'tokens/trainable': 1173994880, 'epoch': '2.865'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 1673/1751 [27:57:17<1:18:11, 60.14s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 1674/1751 [27:58:15<1:16:25, 59.55s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5449', 'grad_norm': '0.1846', 'learning_rate': '1.082e-07', 'ppl': '1.724', 'memory/max_active (GiB)': '72.71', 'memory/max_allocated (GiB)': '72.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.2', 'tokens/total': 3176451072, 'tokens/trainable': 1174641792, 'epoch': '2.867'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 1674/1751 [27:58:15<1:16:25, 59.55s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 1675/1751 [27:59:16<1:16:00, 60.01s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5058', 'grad_norm': '0.1699', 'learning_rate': '1.055e-07', 'ppl': '1.658', 'memory/max_active (GiB)': '76.4', 'memory/max_allocated (GiB)': '76.4', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '30.81', 'tokens/total': 3178358016, 'tokens/trainable': 1175350656, 'epoch': '2.869'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 1675/1751 [27:59:16<1:16:00, 60.01s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 1676/1751 [28:00:17<1:15:05, 60.08s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5173', 'grad_norm': '0.1758', 'learning_rate': '1.028e-07', 'ppl': '1.678', 'memory/max_active (GiB)': '76.35', 'memory/max_allocated (GiB)': '76.35', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '40.12', 'tokens/total': 3180249344, 'tokens/trainable': 1176076160, 'epoch': '2.871'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 1676/1751 [28:00:17<1:15:05, 60.08s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 1677/1751 [28:01:15<1:13:37, 59.70s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.533', 'grad_norm': '0.1846', 'learning_rate': '1.001e-07', 'ppl': '1.704', 'memory/max_active (GiB)': '73.64', 'memory/max_allocated (GiB)': '73.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '153.3', 'tokens/total': 3182133504, 'tokens/trainable': 1176775808, 'epoch': '2.872'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 1677/1751 [28:01:15<1:13:37, 59.70s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 1678/1751 [28:02:15<1:12:44, 59.79s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4737', 'grad_norm': '0.1738', 'learning_rate': '9.744e-08', 'ppl': '1.606', 'memory/max_active (GiB)': '76.4', 'memory/max_allocated (GiB)': '76.4', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '129.9', 'tokens/total': 3184066048, 'tokens/trainable': 1177463168, 'epoch': '2.874'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 1678/1751 [28:02:15<1:12:44, 59.79s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 1679/1751 [28:03:16<1:12:05, 60.07s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5306', 'grad_norm': '0.1797', 'learning_rate': '9.482e-08', 'ppl': '1.7', 'memory/max_active (GiB)': '75.21', 'memory/max_allocated (GiB)': '75.21', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '107.8', 'tokens/total': 3185995520, 'tokens/trainable': 1178174464, 'epoch': '2.876'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 1679/1751 [28:03:16<1:12:05, 60.07s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 1680/1751 [28:04:15<1:10:32, 59.61s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5397', 'grad_norm': '0.1748', 'learning_rate': '9.225e-08', 'ppl': '1.716', 'memory/max_active (GiB)': '69.25', 'memory/max_allocated (GiB)': '69.25', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '150.1', 'tokens/total': 3187876608, 'tokens/trainable': 1178841728, 'epoch': '2.877'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 1680/1751 [28:04:15<1:10:32, 59.61s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 1681/1751 [28:05:17<1:10:22, 60.32s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5011', 'grad_norm': '0.1709', 'learning_rate': '8.971e-08', 'ppl': '1.651', 'memory/max_active (GiB)': '72.51', 'memory/max_allocated (GiB)': '72.51', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '180.4', 'tokens/total': 3189885184, 'tokens/trainable': 1179613440, 'epoch': '2.879'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 1681/1751 [28:05:17<1:10:22, 60.32s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 1682/1751 [28:06:16<1:08:54, 59.92s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4998', 'grad_norm': '0.1729', 'learning_rate': '8.72e-08', 'ppl': '1.648', 'memory/max_active (GiB)': '78.03', 'memory/max_allocated (GiB)': '78.03', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '34.78', 'tokens/total': 3191757824, 'tokens/trainable': 1180327808, 'epoch': '2.881'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 1682/1751 [28:06:16<1:08:54, 59.92s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 1683/1751 [28:07:14<1:07:16, 59.36s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5268', 'grad_norm': '0.1846', 'learning_rate': '8.473e-08', 'ppl': '1.694', 'memory/max_active (GiB)': '72.88', 'memory/max_allocated (GiB)': '72.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.46', 'tokens/total': 3193620992, 'tokens/trainable': 1181023360, 'epoch': '2.882'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 1683/1751 [28:07:14<1:07:16, 59.36s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 1684/1751 [28:08:14<1:06:37, 59.66s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.524', 'grad_norm': '0.1689', 'learning_rate': '8.23e-08', 'ppl': '1.689', 'memory/max_active (GiB)': '73.94', 'memory/max_allocated (GiB)': '73.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.02', 'tokens/total': 3195538432, 'tokens/trainable': 1181740160, 'epoch': '2.884'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋          | 1684/1751 [28:08:14<1:06:37, 59.66s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 1685/1751 [28:09:14<1:05:43, 59.75s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4997', 'grad_norm': '0.1797', 'learning_rate': '7.99e-08', 'ppl': '1.648', 'memory/max_active (GiB)': '72.67', 'memory/max_allocated (GiB)': '72.67', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '121.8', 'tokens/total': 3197441280, 'tokens/trainable': 1182431744, 'epoch': '2.886'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 1685/1751 [28:09:14<1:05:43, 59.75s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 1686/1751 [28:10:16<1:05:29, 60.45s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5357', 'grad_norm': '0.1689', 'learning_rate': '7.753e-08', 'ppl': '1.709', 'memory/max_active (GiB)': '73.07', 'memory/max_allocated (GiB)': '73.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.4', 'tokens/total': 3199414272, 'tokens/trainable': 1183173376, 'epoch': '2.888'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 1686/1751 [28:10:16<1:05:29, 60.45s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 1687/1751 [28:11:17<1:04:40, 60.63s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5108', 'grad_norm': '0.167', 'learning_rate': '7.52e-08', 'ppl': '1.667', 'memory/max_active (GiB)': '77.91', 'memory/max_allocated (GiB)': '77.91', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '36.51', 'tokens/total': 3201357312, 'tokens/trainable': 1183889664, 'epoch': '2.889'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 1687/1751 [28:11:17<1:04:40, 60.63s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 1688/1751 [28:12:17<1:03:23, 60.37s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4923', 'grad_norm': '0.1719', 'learning_rate': '7.291e-08', 'ppl': '1.636', 'memory/max_active (GiB)': '75.97', 'memory/max_allocated (GiB)': '75.97', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '110.9', 'tokens/total': 3203295488, 'tokens/trainable': 1184589440, 'epoch': '2.891'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 1688/1751 [28:12:17<1:03:23, 60.37s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 1689/1751 [28:13:17<1:02:09, 60.15s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5151', 'grad_norm': '0.1758', 'learning_rate': '7.065e-08', 'ppl': '1.674', 'memory/max_active (GiB)': '71.07', 'memory/max_allocated (GiB)': '71.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '103.9', 'tokens/total': 3205209344, 'tokens/trainable': 1185297792, 'epoch': '2.893'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 1689/1751 [28:13:17<1:02:09, 60.15s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 1690/1751 [28:14:16<1:00:48, 59.82s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5464', 'grad_norm': '0.1895', 'learning_rate': '6.843e-08', 'ppl': '1.727', 'memory/max_active (GiB)': '75.89', 'memory/max_allocated (GiB)': '75.89', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '57.47', 'tokens/total': 3207113472, 'tokens/trainable': 1185996416, 'epoch': '2.894'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 1690/1751 [28:14:16<1:00:48, 59.82s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 1691/1751 [28:15:12<58:44, 58.74s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5067', 'grad_norm': '0.1953', 'learning_rate': '6.624e-08', 'ppl': '1.66', 'memory/max_active (GiB)': '75.43', 'memory/max_allocated (GiB)': '75.43', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '84.57', 'tokens/total': 3208852480, 'tokens/trainable': 1186620928, 'epoch': '2.896'}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 1691/1751 [28:15:12<58:44, 58.74s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 1692/1751 [28:16:13<58:35, 59.58s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5036', 'grad_norm': '0.1729', 'learning_rate': '6.409e-08', 'ppl': '1.655', 'memory/max_active (GiB)': '74.72', 'memory/max_allocated (GiB)': '74.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '140.5', 'tokens/total': 3210812416, 'tokens/trainable': 1187372416, 'epoch': '2.898'}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 1692/1751 [28:16:13<58:35, 59.58s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 1693/1751 [28:17:11<57:05, 59.07s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.553', 'grad_norm': '0.1865', 'learning_rate': '6.198e-08', 'ppl': '1.739', 'memory/max_active (GiB)': '68.88', 'memory/max_allocated (GiB)': '68.88', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.37', 'tokens/total': 3212615424, 'tokens/trainable': 1188033920, 'epoch': '2.9'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 1693/1751 [28:17:11<57:05, 59.07s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 1694/1751 [28:18:12<56:28, 59.44s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5148', 'grad_norm': '0.1758', 'learning_rate': '5.989e-08', 'ppl': '1.673', 'memory/max_active (GiB)': '71.72', 'memory/max_allocated (GiB)': '71.72', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.12', 'tokens/total': 3214504960, 'tokens/trainable': 1188758144, 'epoch': '2.901'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 1694/1751 [28:18:12<56:28, 59.44s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 1695/1751 [28:19:12<55:42, 59.69s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5214', 'grad_norm': '0.1758', 'learning_rate': '5.785e-08', 'ppl': '1.684', 'memory/max_active (GiB)': '74.39', 'memory/max_allocated (GiB)': '74.39', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '41.08', 'tokens/total': 3216431872, 'tokens/trainable': 1189491840, 'epoch': '2.903'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 1695/1751 [28:19:12<55:42, 59.69s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 1696/1751 [28:20:13<55:06, 60.12s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5406', 'grad_norm': '0.1943', 'learning_rate': '5.584e-08', 'ppl': '1.717', 'memory/max_active (GiB)': '74.95', 'memory/max_allocated (GiB)': '74.95', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '49.97', 'tokens/total': 3218279680, 'tokens/trainable': 1190142976, 'epoch': '2.905'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 1696/1751 [28:20:13<55:06, 60.12s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 1697/1751 [28:21:14<54:28, 60.54s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5404', 'grad_norm': '0.1865', 'learning_rate': '5.386e-08', 'ppl': '1.717', 'memory/max_active (GiB)': '77.07', 'memory/max_allocated (GiB)': '77.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26.29', 'tokens/total': 3220192000, 'tokens/trainable': 1190866432, 'epoch': '2.906'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 1697/1751 [28:21:14<54:28, 60.54s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 1698/1751 [28:22:15<53:26, 60.51s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4891', 'grad_norm': '0.1826', 'learning_rate': '5.192e-08', 'ppl': '1.631', 'memory/max_active (GiB)': '75.49', 'memory/max_allocated (GiB)': '75.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '31.57', 'tokens/total': 3222135040, 'tokens/trainable': 1191554688, 'epoch': '2.908'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 1698/1751 [28:22:15<53:26, 60.51s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 1699/1751 [28:23:14<51:59, 59.98s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5587', 'grad_norm': '0.1943', 'learning_rate': '5.002e-08', 'ppl': '1.748', 'memory/max_active (GiB)': '73.49', 'memory/max_allocated (GiB)': '73.49', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.36', 'tokens/total': 3223999488, 'tokens/trainable': 1192205184, 'epoch': '2.91'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 1699/1751 [28:23:14<51:59, 59.98s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 1700/1751 [28:24:12<50:33, 59.48s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5478', 'grad_norm': '0.1797', 'learning_rate': '4.815e-08', 'ppl': '1.729', 'memory/max_active (GiB)': '74.09', 'memory/max_allocated (GiB)': '74.09', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.89', 'tokens/total': 3225827328, 'tokens/trainable': 1192865024, 'epoch': '2.912'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 1700/1751 [28:24:12<50:33, 59.48s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 1701/1751 [28:25:13<49:57, 59.94s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5183', 'grad_norm': '0.1768', 'learning_rate': '4.632e-08', 'ppl': '1.679', 'memory/max_active (GiB)': '74.41', 'memory/max_allocated (GiB)': '74.41', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '174.1', 'tokens/total': 3227791104, 'tokens/trainable': 1193565440, 'epoch': '2.913'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 1701/1751 [28:25:13<49:57, 59.94s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 1702/1751 [28:26:13<48:55, 59.90s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5263', 'grad_norm': '0.1787', 'learning_rate': '4.452e-08', 'ppl': '1.693', 'memory/max_active (GiB)': '74.61', 'memory/max_allocated (GiB)': '74.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '87.69', 'tokens/total': 3229651200, 'tokens/trainable': 1194279040, 'epoch': '2.915'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 1702/1751 [28:26:13<48:55, 59.90s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 1703/1751 [28:27:11<47:35, 59.48s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5169', 'grad_norm': '0.1885', 'learning_rate': '4.276e-08', 'ppl': '1.677', 'memory/max_active (GiB)': '68.22', 'memory/max_allocated (GiB)': '68.22', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '81.86', 'tokens/total': 3231496704, 'tokens/trainable': 1194921344, 'epoch': '2.917'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 1703/1751 [28:27:11<47:35, 59.48s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 1704/1751 [28:28:09<46:07, 58.89s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5706', 'grad_norm': '0.2002', 'learning_rate': '4.103e-08', 'ppl': '1.769', 'memory/max_active (GiB)': '66.78', 'memory/max_allocated (GiB)': '66.78', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '116.9', 'tokens/total': 3233232896, 'tokens/trainable': 1195559936, 'epoch': '2.918'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 1704/1751 [28:28:09<46:07, 58.89s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 1705/1751 [28:29:10<45:35, 59.47s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5472', 'grad_norm': '0.1748', 'learning_rate': '3.934e-08', 'ppl': '1.728', 'memory/max_active (GiB)': '76.94', 'memory/max_allocated (GiB)': '76.94', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '12.35', 'tokens/total': 3235153408, 'tokens/trainable': 1196256640, 'epoch': '2.92'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 1705/1751 [28:29:10<45:35, 59.47s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 1706/1751 [28:30:08<44:21, 59.14s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5253', 'grad_norm': '0.1758', 'learning_rate': '3.769e-08', 'ppl': '1.691', 'memory/max_active (GiB)': '73.56', 'memory/max_allocated (GiB)': '73.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '123.9', 'tokens/total': 3237045760, 'tokens/trainable': 1196939264, 'epoch': '2.922'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 1706/1751 [28:30:08<44:21, 59.14s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 1707/1751 [28:31:07<43:16, 59.01s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4929', 'grad_norm': '0.1709', 'learning_rate': '3.607e-08', 'ppl': '1.637', 'memory/max_active (GiB)': '73.92', 'memory/max_allocated (GiB)': '73.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '134.6', 'tokens/total': 3238885376, 'tokens/trainable': 1197630720, 'epoch': '2.924'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 1707/1751 [28:31:07<43:16, 59.01s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 1708/1751 [28:32:05<42:11, 58.86s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5102', 'grad_norm': '0.168', 'learning_rate': '3.448e-08', 'ppl': '1.666', 'memory/max_active (GiB)': '73.01', 'memory/max_allocated (GiB)': '73.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '149.6', 'tokens/total': 3240730880, 'tokens/trainable': 1198356864, 'epoch': '2.925'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 1708/1751 [28:32:05<42:11, 58.86s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 1709/1751 [28:33:06<41:32, 59.34s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5097', 'grad_norm': '0.1709', 'learning_rate': '3.294e-08', 'ppl': '1.665', 'memory/max_active (GiB)': '73.13', 'memory/max_allocated (GiB)': '73.13', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '104.9', 'tokens/total': 3242660096, 'tokens/trainable': 1199060992, 'epoch': '2.927'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 1709/1751 [28:33:06<41:32, 59.34s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 1710/1751 [28:34:07<40:54, 59.86s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.514', 'grad_norm': '0.1826', 'learning_rate': '3.142e-08', 'ppl': '1.672', 'memory/max_active (GiB)': '71.5', 'memory/max_allocated (GiB)': '71.5', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '66.38', 'tokens/total': 3244596480, 'tokens/trainable': 1199790976, 'epoch': '2.929'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 1710/1751 [28:34:07<40:54, 59.86s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 1711/1751 [28:35:09<40:27, 60.70s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4849', 'grad_norm': '0.1602', 'learning_rate': '2.994e-08', 'ppl': '1.624', 'memory/max_active (GiB)': '75.58', 'memory/max_allocated (GiB)': '75.58', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '90.43', 'tokens/total': 3246591488, 'tokens/trainable': 1200552192, 'epoch': '2.93'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 1711/1751 [28:35:09<40:27, 60.70s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 1712/1751 [28:36:10<39:24, 60.63s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5052', 'grad_norm': '0.1826', 'learning_rate': '2.85e-08', 'ppl': '1.657', 'memory/max_active (GiB)': '74.55', 'memory/max_allocated (GiB)': '74.55', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '124.3', 'tokens/total': 3248510208, 'tokens/trainable': 1201255168, 'epoch': '2.932'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 1712/1751 [28:36:10<39:24, 60.63s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 1713/1751 [28:37:11<38:31, 60.83s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4842', 'grad_norm': '0.1797', 'learning_rate': '2.71e-08', 'ppl': '1.623', 'memory/max_active (GiB)': '76.38', 'memory/max_allocated (GiB)': '76.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '108.6', 'tokens/total': 3250473728, 'tokens/trainable': 1201975808, 'epoch': '2.934'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 1713/1751 [28:37:11<38:31, 60.83s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 1714/1751 [28:38:12<37:28, 60.77s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5135', 'grad_norm': '0.1719', 'learning_rate': '2.572e-08', 'ppl': '1.671', 'memory/max_active (GiB)': '76.64', 'memory/max_allocated (GiB)': '76.64', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '39.94', 'tokens/total': 3252402688, 'tokens/trainable': 1202691712, 'epoch': '2.936'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 1714/1751 [28:38:12<37:28, 60.77s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 1715/1751 [28:39:13<36:27, 60.78s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.539', 'grad_norm': '0.1885', 'learning_rate': '2.439e-08', 'ppl': '1.714', 'memory/max_active (GiB)': '73.73', 'memory/max_allocated (GiB)': '73.73', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '101.7', 'tokens/total': 3254336256, 'tokens/trainable': 1203421824, 'epoch': '2.937'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 1715/1751 [28:39:13<36:27, 60.78s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 1716/1751 [28:40:12<35:17, 60.50s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4869', 'grad_norm': '0.1709', 'learning_rate': '2.309e-08', 'ppl': '1.627', 'memory/max_active (GiB)': '74.01', 'memory/max_allocated (GiB)': '74.01', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '114.5', 'tokens/total': 3256210688, 'tokens/trainable': 1204108032, 'epoch': '2.939'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 1716/1751 [28:40:12<35:17, 60.50s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 1717/1751 [28:41:14<34:29, 60.86s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5228', 'grad_norm': '0.1855', 'learning_rate': '2.182e-08', 'ppl': '1.687', 'memory/max_active (GiB)': '74.57', 'memory/max_allocated (GiB)': '74.57', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '159.2', 'tokens/total': 3258148864, 'tokens/trainable': 1204849536, 'epoch': '2.941'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 1717/1751 [28:41:14<34:29, 60.86s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 1718/1751 [28:42:12<32:56, 59.90s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5359', 'grad_norm': '0.1865', 'learning_rate': '2.06e-08', 'ppl': '1.709', 'memory/max_active (GiB)': '76.24', 'memory/max_allocated (GiB)': '76.24', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '102.3', 'tokens/total': 3259955200, 'tokens/trainable': 1205513984, 'epoch': '2.942'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 1718/1751 [28:42:12<32:56, 59.90s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 1719/1751 [28:43:09<31:33, 59.18s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5216', 'grad_norm': '0.1816', 'learning_rate': '1.94e-08', 'ppl': '1.685', 'memory/max_active (GiB)': '76.47', 'memory/max_allocated (GiB)': '76.47', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '101.1', 'tokens/total': 3261788928, 'tokens/trainable': 1206190848, 'epoch': '2.944'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 1719/1751 [28:43:09<31:33, 59.18s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 1720/1751 [28:44:12<31:09, 60.32s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5118', 'grad_norm': '0.1709', 'learning_rate': '1.824e-08', 'ppl': '1.668', 'memory/max_active (GiB)': '76.36', 'memory/max_allocated (GiB)': '76.36', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '33.64', 'tokens/total': 3263797504, 'tokens/trainable': 1206920320, 'epoch': '2.946'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏    | 1720/1751 [28:44:12<31:09, 60.32s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 1721/1751 [28:45:15<30:28, 60.96s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5389', 'grad_norm': '0.1787', 'learning_rate': '1.712e-08', 'ppl': '1.714', 'memory/max_active (GiB)': '76.52', 'memory/max_allocated (GiB)': '76.52', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '134.2', 'tokens/total': 3265801472, 'tokens/trainable': 1207651328, 'epoch': '2.948'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 1721/1751 [28:45:15<30:28, 60.96s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 1722/1751 [28:46:15<29:23, 60.81s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5605', 'grad_norm': '0.1875', 'learning_rate': '1.604e-08', 'ppl': '1.751', 'memory/max_active (GiB)': '73.32', 'memory/max_allocated (GiB)': '73.32', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.71', 'tokens/total': 3267677184, 'tokens/trainable': 1208365056, 'epoch': '2.949'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 1722/1751 [28:46:15<29:23, 60.81s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 1723/1751 [28:47:16<28:22, 60.82s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5317', 'grad_norm': '0.1787', 'learning_rate': '1.498e-08', 'ppl': '1.702', 'memory/max_active (GiB)': '74.92', 'memory/max_allocated (GiB)': '74.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '26', 'tokens/total': 3269555456, 'tokens/trainable': 1209059328, 'epoch': '2.951'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 1723/1751 [28:47:16<28:22, 60.82s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 1724/1751 [28:48:17<27:25, 60.95s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5121', 'grad_norm': '0.1768', 'learning_rate': '1.397e-08', 'ppl': '1.669', 'memory/max_active (GiB)': '71.83', 'memory/max_allocated (GiB)': '71.83', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '121.5', 'tokens/total': 3271480320, 'tokens/trainable': 1209786496, 'epoch': '2.953'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 1724/1751 [28:48:17<27:25, 60.95s/it] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 1725/1751 [28:49:16<26:07, 60.29s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5259', 'grad_norm': '0.1748', 'learning_rate': '1.299e-08', 'ppl': '1.692', 'memory/max_active (GiB)': '75.3', 'memory/max_allocated (GiB)': '75.3', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '80.02', 'tokens/total': 3273324032, 'tokens/trainable': 1210464384, 'epoch': '2.954'}
 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 1725/1751 [28:49:16<26:07, 60.29s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 1726/1751 [28:50:16<25:04, 60.19s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5251', 'grad_norm': '0.1865', 'learning_rate': '1.205e-08', 'ppl': '1.691', 'memory/max_active (GiB)': '70.92', 'memory/max_allocated (GiB)': '70.92', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '32.92', 'tokens/total': 3275206400, 'tokens/trainable': 1211158656, 'epoch': '2.956'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 1726/1751 [28:50:16<25:04, 60.19s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 1727/1751 [28:51:16<24:04, 60.19s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5166', 'grad_norm': '0.1709', 'learning_rate': '1.114e-08', 'ppl': '1.676', 'memory/max_active (GiB)': '77.26', 'memory/max_allocated (GiB)': '77.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '67.12', 'tokens/total': 3277091072, 'tokens/trainable': 1211838208, 'epoch': '2.958'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎   | 1727/1751 [28:51:16<24:04, 60.19s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 1728/1751 [28:52:16<23:03, 60.15s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5433', 'grad_norm': '0.1855', 'learning_rate': '1.026e-08', 'ppl': '1.722', 'memory/max_active (GiB)': '69.71', 'memory/max_allocated (GiB)': '69.71', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '119.5', 'tokens/total': 3278969344, 'tokens/trainable': 1212518656, 'epoch': '2.96'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 1728/1751 [28:52:16<23:03, 60.15s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 1729/1751 [28:53:16<22:01, 60.08s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5263', 'grad_norm': '0.1787', 'learning_rate': '9.427e-09', 'ppl': '1.693', 'memory/max_active (GiB)': '76.07', 'memory/max_allocated (GiB)': '76.07', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '148.8', 'tokens/total': 3280864256, 'tokens/trainable': 1213210112, 'epoch': '2.961'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 1729/1751 [28:53:16<22:01, 60.08s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 1730/1751 [28:54:16<21:02, 60.11s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4953', 'grad_norm': '0.1748', 'learning_rate': '8.625e-09', 'ppl': '1.641', 'memory/max_active (GiB)': '74.33', 'memory/max_allocated (GiB)': '74.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '95.38', 'tokens/total': 3282804992, 'tokens/trainable': 1213929600, 'epoch': '2.963'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 1730/1751 [28:54:16<21:02, 60.11s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 1731/1751 [28:55:16<20:00, 60.01s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4928', 'grad_norm': '0.1729', 'learning_rate': '7.859e-09', 'ppl': '1.637', 'memory/max_active (GiB)': '73.38', 'memory/max_allocated (GiB)': '73.38', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '150.9', 'tokens/total': 3284737792, 'tokens/trainable': 1214667648, 'epoch': '2.965'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 1731/1751 [28:55:16<20:00, 60.01s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 1732/1751 [28:56:16<19:01, 60.09s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.4907', 'grad_norm': '0.1758', 'learning_rate': '7.128e-09', 'ppl': '1.633', 'memory/max_active (GiB)': '74.26', 'memory/max_allocated (GiB)': '74.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '52.32', 'tokens/total': 3286704384, 'tokens/trainable': 1215383424, 'epoch': '2.966'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 1732/1751 [28:56:16<19:01, 60.09s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 1733/1751 [28:57:16<17:57, 59.84s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5261', 'grad_norm': '0.1807', 'learning_rate': '6.433e-09', 'ppl': '1.692', 'memory/max_active (GiB)': '73.33', 'memory/max_allocated (GiB)': '73.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '268.6', 'tokens/total': 3288580096, 'tokens/trainable': 1216046848, 'epoch': '2.968'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 1733/1751 [28:57:16<17:57, 59.84s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 1734/1751 [28:58:15<16:55, 59.73s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5393', 'grad_norm': '0.1855', 'learning_rate': '5.774e-09', 'ppl': '1.715', 'memory/max_active (GiB)': '72.9', 'memory/max_allocated (GiB)': '72.9', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '10.92', 'tokens/total': 3290411776, 'tokens/trainable': 1216722816, 'epoch': '2.97'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 1734/1751 [28:58:15<16:55, 59.73s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 1735/1751 [28:59:13<15:45, 59.11s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5695', 'grad_norm': '0.1943', 'learning_rate': '5.15e-09', 'ppl': '1.767', 'memory/max_active (GiB)': '73.66', 'memory/max_allocated (GiB)': '73.66', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '162.3', 'tokens/total': 3292215040, 'tokens/trainable': 1217403520, 'epoch': '2.972'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 1735/1751 [28:59:13<15:45, 59.11s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 1736/1751 [29:00:14<14:58, 59.88s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5747', 'grad_norm': '0.1914', 'learning_rate': '4.562e-09', 'ppl': '1.777', 'memory/max_active (GiB)': '75.17', 'memory/max_allocated (GiB)': '75.17', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '65.98', 'tokens/total': 3294207232, 'tokens/trainable': 1218107264, 'epoch': '2.973'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 1736/1751 [29:00:14<14:58, 59.88s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 1737/1751 [29:01:14<13:56, 59.72s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5102', 'grad_norm': '0.1719', 'learning_rate': '4.01e-09', 'ppl': '1.666', 'memory/max_active (GiB)': '75.62', 'memory/max_allocated (GiB)': '75.62', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '24.61', 'tokens/total': 3296095744, 'tokens/trainable': 1218824320, 'epoch': '2.975'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 1737/1751 [29:01:14<13:56, 59.72s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 1738/1751 [29:02:14<12:58, 59.91s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5629', 'grad_norm': '0.1846', 'learning_rate': '3.493e-09', 'ppl': '1.756', 'memory/max_active (GiB)': '76.63', 'memory/max_allocated (GiB)': '76.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '43.04', 'tokens/total': 3298022656, 'tokens/trainable': 1219544704, 'epoch': '2.977'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 1738/1751 [29:02:14<12:58, 59.91s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 1739/1751 [29:03:14<11:59, 59.93s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5358', 'grad_norm': '0.1787', 'learning_rate': '3.012e-09', 'ppl': '1.709', 'memory/max_active (GiB)': '74.32', 'memory/max_allocated (GiB)': '74.32', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '42.73', 'tokens/total': 3299904000, 'tokens/trainable': 1220232320, 'epoch': '2.978'}
 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 1739/1751 [29:03:14<11:59, 59.93s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 1740/1751 [29:04:12<10:53, 59.43s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5908', 'grad_norm': '0.1895', 'learning_rate': '2.566e-09', 'ppl': '1.805', 'memory/max_active (GiB)': '69.28', 'memory/max_allocated (GiB)': '69.28', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '87.5', 'tokens/total': 3301704704, 'tokens/trainable': 1220893696, 'epoch': '2.98'}
 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 1740/1751 [29:04:12<10:53, 59.43s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 1741/1751 [29:05:13<09:56, 59.67s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5281', 'grad_norm': '0.1846', 'learning_rate': '2.156e-09', 'ppl': '1.696', 'memory/max_active (GiB)': '73.86', 'memory/max_allocated (GiB)': '73.86', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '70.61', 'tokens/total': 3303644416, 'tokens/trainable': 1221615616, 'epoch': '2.982'}
 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 1741/1751 [29:05:13<09:56, 59.67s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 1742/1751 [29:06:10<08:51, 59.04s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.509', 'grad_norm': '0.1689', 'learning_rate': '1.782e-09', 'ppl': '1.664', 'memory/max_active (GiB)': '75.26', 'memory/max_allocated (GiB)': '75.26', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '225.9', 'tokens/total': 3305431808, 'tokens/trainable': 1222281600, 'epoch': '2.984'}
 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 1742/1751 [29:06:10<08:51, 59.04s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 1743/1751 [29:07:10<07:54, 59.32s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5028', 'grad_norm': '0.1748', 'learning_rate': '1.444e-09', 'ppl': '1.653', 'memory/max_active (GiB)': '75.56', 'memory/max_allocated (GiB)': '75.56', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '82.7', 'tokens/total': 3307300608, 'tokens/trainable': 1222972672, 'epoch': '2.985'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 1743/1751 [29:07:10<07:54, 59.32s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 1744/1751 [29:08:09<06:53, 59.11s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5317', 'grad_norm': '0.1982', 'learning_rate': '1.141e-09', 'ppl': '1.702', 'memory/max_active (GiB)': '68.69', 'memory/max_allocated (GiB)': '68.69', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '162.6', 'tokens/total': 3309084928, 'tokens/trainable': 1223616000, 'epoch': '2.987'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 1744/1751 [29:08:09<06:53, 59.11s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 1745/1751 [29:09:09<05:56, 59.38s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5071', 'grad_norm': '0.1885', 'learning_rate': '8.733e-10', 'ppl': '1.66', 'memory/max_active (GiB)': '77.33', 'memory/max_allocated (GiB)': '77.33', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '55.7', 'tokens/total': 3310921216, 'tokens/trainable': 1224303872, 'epoch': '2.989'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 1745/1751 [29:09:09<05:56, 59.38s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 1746/1751 [29:10:07<04:54, 58.93s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5265', 'grad_norm': '0.1963', 'learning_rate': '6.416e-10', 'ppl': '1.693', 'memory/max_active (GiB)': '76.12', 'memory/max_allocated (GiB)': '76.12', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '98.99', 'tokens/total': 3312750336, 'tokens/trainable': 1224930560, 'epoch': '2.99'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 1746/1751 [29:10:07<04:54, 58.93s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 1747/1751 [29:11:05<03:55, 58.78s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5498', 'grad_norm': '0.1826', 'learning_rate': '4.456e-10', 'ppl': '1.733', 'memory/max_active (GiB)': '74.75', 'memory/max_allocated (GiB)': '74.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '101', 'tokens/total': 3314560256, 'tokens/trainable': 1225597312, 'epoch': '2.992'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 1747/1751 [29:11:05<03:55, 58.78s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 1748/1751 [29:12:06<02:57, 59.26s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.495', 'grad_norm': '0.1709', 'learning_rate': '2.852e-10', 'ppl': '1.64', 'memory/max_active (GiB)': '76.75', 'memory/max_allocated (GiB)': '76.75', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '64.02', 'tokens/total': 3316440320, 'tokens/trainable': 1226302208, 'epoch': '2.994'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 1748/1751 [29:12:06<02:57, 59.26s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 1749/1751 [29:13:05<01:58, 59.31s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.5223', 'grad_norm': '0.1826', 'learning_rate': '1.604e-10', 'ppl': '1.686', 'memory/max_active (GiB)': '74.19', 'memory/max_allocated (GiB)': '74.19', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '69.84', 'tokens/total': 3318282240, 'tokens/trainable': 1226980608, 'epoch': '2.996'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 1749/1751 [29:13:05<01:58, 59.31s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 1750/1751 [29:14:06<00:59, 59.89s/it]                                                                                                                                                                                                                                                                                                                          {'loss': '0.495', 'grad_norm': '0.1709', 'learning_rate': '7.129e-11', 'ppl': '1.64', 'memory/max_active (GiB)': '69.63', 'memory/max_allocated (GiB)': '69.63', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '86.13', 'tokens/total': 3320205056, 'tokens/trainable': 1227721728, 'epoch': '2.997'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 1750/1751 [29:14:06<00:59, 59.89s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1751/1751 [29:15:10<00:00, 60.94s/it]                                                                                                                                                                                                                                                                                                                                         {'loss': '0.4682', 'grad_norm': '0.1562', 'learning_rate': '1.782e-11', 'ppl': '1.597', 'memory/max_active (GiB)': '75.61', 'memory/max_allocated (GiB)': '75.61', 'memory/device_reserved (GiB)': '112.2', 'tokens/train_per_sec_per_gpu': '60.35', 'tokens/total': 3322252544, 'tokens/trainable': 1228497536, 'epoch': '2.999'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1751/1751 [29:15:10<00:00, 60.94s/it][2026-02-05 08:38:37,283] [INFO] [axolotl.core.trainers.base._save:721] [PID:23602] Saving model checkpoint to ./outputs/checkpoint-1751

Writing model shards:   0%|                                                                                                                                                                                                                                                                                        | 0/1 [00:00<?, ?it/s][A
Writing model shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.69s/it][AWriting model shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.69s/it]
                                                                                                                                                                                                                                                                                                                                         {'train_runtime': '1.053e+05', 'train_samples_per_second': '4.256', 'train_steps_per_second': '0.017', 'train_loss': '0.5364', 'memory/max_active (GiB)': '8.75', 'memory/max_allocated (GiB)': '8.75', 'memory/device_reserved (GiB)': '112.2', 'epoch': '2.999', 'tokens/train_per_sec_per_gpu': '0'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1751/1751 [29:15:26<00:00, 60.94s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1751/1751 [29:15:26<00:00, 60.15s/it]
[2026-02-05 08:39:38,109] [INFO] [axolotl.train.save_trained_model:226] [PID:23602] Training completed! Saving trained model to ./outputs.
Writing model shards:   0%|                                                                                                                                                                                                                                                                                        | 0/1 [00:00<?, ?it/s]Writing model shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.33s/it]Writing model shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.33s/it]
[2026-02-05 08:39:45,497] [INFO] [axolotl.train.save_trained_model:340] [PID:23602] Model successfully saved to ./outputs
[2026-02-05 08:39:45,626] [INFO] [axolotl.core.trainers.base._save:721] [PID:23602] Saving model checkpoint to ./outputs
Writing model shards:   0%|                                                                                                                                                                                                                                                                                        | 0/1 [00:00<?, ?it/s]Writing model shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.37s/it]Writing model shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.37s/it]
Processing Files (0 / 0)      : |                                                                                                                                                                                                                                                                           |  0.00B /  0.00B            
New Data Upload               : |                                                                                                                                                                                                                                                                           |  0.00B /  0.00B            [A

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:   1%|██▊                                                                                                                                                                                                                                                                    | 33.5MB / 3.09GB            [A[A[A[A

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:   1%|██▊                                                                                                                                                                                                                                                                    | 33.5MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :   1%|███▊                                                                                                                                                                                                                                                                   | 45.0MB / 3.10GB,   ???B/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:   2%|█████▋                                                                                                                                                                                                                                                                 | 67.1MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :   3%|██████▋                                                                                                                                                                                                                                                                | 78.5MB / 3.10GB,  168MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:   3%|███████▊                                                                                                                                                                                                                                                               | 92.3MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :   3%|████████▊                                                                                                                                                                                                                                                              |  104MB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:   4%|██████████▋                                                                                                                                                                                                                                                            |  126MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :   4%|███████████▋                                                                                                                                                                                                                                                           |  137MB / 3.10GB,  154MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:   5%|████████████▊                                                                                                                                                                                                                                                          |  151MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :   5%|█████████████▊                                                                                                                                                                                                                                                         |  162MB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:   6%|███████████████▋                                                                                                                                                                                                                                                       |  185MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :   6%|████████████████▋                                                                                                                                                                                                                                                      |  196MB / 3.10GB,  151MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:   7%|█████████████████▊                                                                                                                                                                                                                                                     |  210MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :   7%|██████████████████▊                                                                                                                                                                                                                                                    |  221MB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:   8%|████████████████████▋                                                                                                                                                                                                                                                  |  243MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :   8%|█████████████████████▌                                                                                                                                                                                                                                                 |  255MB / 3.10GB,  150MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:   9%|██████████████████████▊                                                                                                                                                                                                                                                |  268MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :   9%|███████████████████████▋                                                                                                                                                                                                                                               |  280MB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  10%|█████████████████████████▋                                                                                                                                                                                                                                             |  302MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  10%|██████████████████████████▌                                                                                                                                                                                                                                            |  313MB / 3.10GB,  149MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  11%|████████████████████████████▌                                                                                                                                                                                                                                          |  336MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  11%|█████████████████████████████▍                                                                                                                                                                                                                                         |  347MB / 3.10GB,  151MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  12%|██████████████████████████████▋                                                                                                                                                                                                                                        |  361MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  12%|███████████████████████████████▌                                                                                                                                                                                                                                       |  372MB / 3.10GB,  149MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  13%|█████████████████████████████████▌                                                                                                                                                                                                                                     |  394MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  13%|██████████████████████████████████▍                                                                                                                                                                                                                                    |  406MB / 3.10GB,  150MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  14%|███████████████████████████████████▋                                                                                                                                                                                                                                   |  419MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  14%|████████████████████████████████████▌                                                                                                                                                                                                                                  |  431MB / 3.10GB,  148MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  15%|██████████████████████████████████████▌                                                                                                                                                                                                                                |  453MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  15%|███████████████████████████████████████▍                                                                                                                                                                                                                               |  464MB / 3.10GB,  150MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  15%|████████████████████████████████████████▋                                                                                                                                                                                                                              |  478MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  16%|█████████████████████████████████████████▌                                                                                                                                                                                                                             |  490MB / 3.10GB,  148MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  17%|███████████████████████████████████████████▌                                                                                                                                                                                                                           |  512MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  17%|████████████████████████████████████████████▍                                                                                                                                                                                                                          |  523MB / 3.10GB,  149MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  17%|█████████████████████████████████████████████▋                                                                                                                                                                                                                         |  537MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  18%|██████████████████████████████████████████████▌                                                                                                                                                                                                                        |  548MB / 3.10GB,  148MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  18%|████████████████████████████████████████████████▌                                                                                                                                                                                                                      |  570MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  19%|█████████████████████████████████████████████████▍                                                                                                                                                                                                                     |  582MB / 3.10GB,  149MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  19%|██████████████████████████████████████████████████▋                                                                                                                                                                                                                    |  596MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  20%|███████████████████████████████████████████████████▌                                                                                                                                                                                                                   |  607MB / 3.10GB,  148MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  20%|█████████████████████████████████████████████████████▌                                                                                                                                                                                                                 |  629MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  21%|██████████████████████████████████████████████████████▎                                                                                                                                                                                                                |  641MB / 3.10GB,  149MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  21%|███████████████████████████████████████████████████████▋                                                                                                                                                                                                               |  654MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  21%|████████████████████████████████████████████████████████▍                                                                                                                                                                                                              |  666MB / 3.10GB,  148MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  22%|██████████████████████████████████████████████████████████▌                                                                                                                                                                                                            |  688MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  23%|███████████████████████████████████████████████████████████▎                                                                                                                                                                                                           |  699MB / 3.10GB,  149MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  23%|████████████████████████████████████████████████████████████▋                                                                                                                                                                                                          |  713MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  23%|█████████████████████████████████████████████████████████████▍                                                                                                                                                                                                         |  724MB / 3.10GB,  148MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  24%|███████████████████████████████████████████████████████████████▌                                                                                                                                                                                                       |  746MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  24%|████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                      |  758MB / 3.10GB,  149MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  25%|█████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                     |  772MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  25%|██████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                    |  783MB / 3.10GB,  148MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  26%|████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                  |  805MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  26%|█████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                 |  817MB / 3.10GB,  148MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  27%|██████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                |  830MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  27%|███████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                               |  842MB / 3.10GB,  148MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  28%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                             |  864MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  28%|██████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                            |  875MB / 3.10GB,  148MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  29%|███████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                           |  889MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  29%|████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                          |  901MB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  30%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                         |  914MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  30%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                        |  926MB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  30%|████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                       |  940MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  31%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                      |  951MB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  32%|██████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                    |  973MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  32%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                   |  985MB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  32%|█████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                  |  998MB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  33%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                 | 1.01GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  33%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                               | 1.03GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  34%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                              | 1.04GB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  35%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                            | 1.07GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  35%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                           | 1.08GB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  35%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                          | 1.09GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  36%|█████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                         | 1.10GB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  36%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                        | 1.12GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  36%|███████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                       | 1.13GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  37%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                     | 1.15GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  37%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                    | 1.16GB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  38%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                   | 1.17GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  38%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                  | 1.19GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  39%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                | 1.21GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  39%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                               | 1.22GB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  40%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                              | 1.23GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  40%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                             | 1.24GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  41%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                           | 1.27GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  41%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                          | 1.28GB / 3.10GB,  147MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  42%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                         | 1.29GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  42%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                        | 1.30GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  43%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                      | 1.32GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  43%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                      | 1.33GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  44%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                    | 1.35GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  44%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                   | 1.36GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  45%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                 | 1.38GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  45%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                 | 1.39GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  46%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                               | 1.41GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  46%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                              | 1.42GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  46%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                            | 1.43GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  47%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                            | 1.45GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  47%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                          | 1.46GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  47%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                          | 1.47GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  48%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                       | 1.49GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  49%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                       | 1.50GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  49%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                     | 1.52GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  49%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                     | 1.53GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                  | 1.55GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                  | 1.56GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  51%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                | 1.58GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  51%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                | 1.59GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  52%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                             | 1.61GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  52%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                             | 1.62GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  53%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                           | 1.64GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  53%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                           | 1.65GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                        | 1.67GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                        | 1.68GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  55%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                      | 1.69GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  55%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                      | 1.71GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                   | 1.73GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                   | 1.74GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 1.75GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 1.76GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 1.79GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                              | 1.80GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                            | 1.81GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                            | 1.82GB / 3.10GB,  145MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                         | 1.85GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                         | 1.86GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 1.87GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                       | 1.88GB / 3.10GB,  145MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                    | 1.90GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 1.92GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 1.93GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 1.94GB / 3.10GB,  145MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 1.96GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 1.97GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 1.99GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                             | 2.00GB / 3.10GB,  145MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 2.02GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                          | 2.03GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 2.05GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 2.06GB / 3.10GB,  145MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 2.08GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                     | 2.09GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 2.11GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                   | 2.12GB / 3.10GB,  145MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 2.14GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                | 2.15GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                              | 2.16GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 2.18GB / 3.10GB,  145MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 2.20GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 2.21GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 2.23GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 2.24GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 2.26GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 2.27GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                    | 2.29GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 2.30GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                 | 2.32GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 2.33GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                               | 2.35GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 2.36GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 2.37GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 2.39GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 2.40GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 2.41GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 2.43GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 2.44GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 2.46GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 2.47GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 2.49GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 2.50GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 2.52GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                | 2.53GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 2.55GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 2.56GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 2.58GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 2.59GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 2.60GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 2.61GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 2.63GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 2.65GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 2.66GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 2.67GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 2.69GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 2.70GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                               | 2.72GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 2.73GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 2.75GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 2.76GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 2.78GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 2.79GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 2.80GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 2.81GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 2.84GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 2.85GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 2.86GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 2.87GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 2.89GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 2.91GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 2.92GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 2.93GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 2.95GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 2.96GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 2.98GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 2.99GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 3.01GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 3.02GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 3.04GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 3.05GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 3.07GB / 3.09GB            [A[A[A[AProcessing Files (2 / 3)      :  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 3.08GB / 3.10GB,  146MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.09GB / 3.09GB            [A[A[A[AProcessing Files (3 / 3)      : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.10GB / 3.10GB,  145MB/s  

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.09GB / 3.09GB            [A[A[A[A

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.09GB / 3.09GB            [A[A[A[A

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.09GB / 3.09GB            [A[A[A[A

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.09GB / 3.09GB            [A[A[A[A

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.09GB / 3.09GB            [A[A[A[A

  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            [A[A


  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            [A[A[A


  ...outputs/model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.09GB / 3.09GB            [A[A[A[AProcessing Files (3 / 3)      : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.10GB / 3.10GB,  130MB/s  
New Data Upload               : |                                                                                                                                                                                                                                                                           |  0.00B /  0.00B,  0.00B/s  
  ...outputs/training_args.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.61kB / 6.61kB            
  ...tl/outputs/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.4MB / 11.4MB            
  ...outputs/model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.09GB / 3.09GB