RayDu0010 committed (verified)
Commit 0496999 · Parent: 94aaa7f

Upload folder using huggingface_hub
141_128_e3_3e-5/adapter_config.json ADDED
@@ -0,0 +1,39 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-base",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}
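
This config describes a rank-128 LoRA adapter (lora_alpha 256, so an effective scale of alpha/r = 2 with use_rslora off; dropout 0.05) over every attention and MLP projection of ibm-granite/granite-3.3-8b-base. A minimal sketch of how such an adapter is typically attached for inference with PEFT — the local folder name 141_128_e3_3e-5 is the one from this commit; dtype and device placement are illustrative:

# Minimal sketch: attach the uploaded LoRA adapter to its base model for inference.
# Assumes the 141_128_e3_3e-5 folder has been downloaded locally.
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "ibm-granite/granite-3.3-8b-base",  # base_model_name_or_path from adapter_config.json
    torch_dtype=torch.bfloat16,          # illustrative choice
    device_map="auto",
)
# Reads adapter_config.json and adapter_model.safetensors from the folder.
model = PeftModel.from_pretrained(base, "141_128_e3_3e-5")
model.eval()
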
141_128_e3_3e-5/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57e49c720f9be232a953f37ce73e0e0a4ba07598099de772a9402472fb05b96a
+size 791751704
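
The three lines above are a Git LFS pointer, not the weights themselves: the ~792 MB safetensors payload is stored out of band and addressed by its SHA-256. A standard-library sketch, assuming the folder has been downloaded locally, for checking a copy against the pointer's oid and size:

# Verify a downloaded adapter_model.safetensors against the LFS pointer's oid/size.
import hashlib, os

path = "141_128_e3_3e-5/adapter_model.safetensors"
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

assert os.path.getsize(path) == 791751704, "size mismatch vs. pointer"
assert h.hexdigest() == "57e49c720f9be232a953f37ce73e0e0a4ba07598099de772a9402472fb05b96a"
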
141_128_e3_3e-5/latest ADDED
@@ -0,0 +1 @@
+global_step1665
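
`latest` is the tag file DeepSpeed writes alongside its checkpoint directories; here it names global_step1665, matching the final global_step recorded in trainer_state.json below. A small sketch of resolving the tag (the <root>/<tag>/ layout is DeepSpeed's convention, assumed here):

# Resolve the DeepSpeed checkpoint directory named by the `latest` tag file.
from pathlib import Path

ckpt_root = Path("141_128_e3_3e-5")
tag = (ckpt_root / "latest").read_text().strip()  # -> "global_step1665"
ckpt_dir = ckpt_root / tag                        # conventional layout: <root>/<tag>/
print(ckpt_dir)
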
141_128_e3_3e-5/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
141_128_e3_3e-5/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc18311fd1a9d44675206a9fa3d745c4d9299bdba375c53dc972cb9a07673032
+size 16389
141_128_e3_3e-5/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:374b0ccef1c4ca445265b5e4a0ec2bee92ee5ef8c4be145023d4775634422807
+size 16389
141_128_e3_3e-5/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c8ec4c77244a6ed4b740d126911e9eb07a250ca3aacbd4baf6541e7b1492746
+size 16389
141_128_e3_3e-5/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8561dc5db6432d29b9ab489e017b8ec9b17a1796633bf9722289f9be79b69ef1
+size 16389
141_128_e3_3e-5/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78ca883a09eeb152bf749422c2c76a24120e4bc1b3c3df3ca9b7083a268f36c2
+size 16389
141_128_e3_3e-5/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:850c0236d52f9b4404adb91616dfeaaa6b0dd9c0e48c872486370e685f0e381d
+size 16389
141_128_e3_3e-5/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8727960f169c0b1f469d85b1d69c3f7d38f134c44d83b0c31cb32a91a8988f4
+size 16389
141_128_e3_3e-5/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24a1972d2951b00a968941f1ea62aed487a8d67c4d814405cbffb6ecb5609351
+size 16389
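
The eight rng_state_{0..7}.pth files, one per rank, indicate the run used 8 data-parallel processes; each stores the RNG snapshot that Hugging Face Trainer saves so a resumed run reproduces the same data order and dropout masks. A hedged sketch of restoring one rank's state — the dict keys follow Trainer's convention and are an assumption here:

# Restore one rank's RNG snapshot (dict keys per HF Trainer's convention; assumed).
import random
import numpy as np
import torch

state = torch.load("141_128_e3_3e-5/rng_state_0.pth", weights_only=False)
random.setstate(state["python"])
np.random.set_state(state["numpy"])
torch.set_rng_state(state["cpu"])
if torch.cuda.is_available() and "cuda" in state:
    torch.cuda.set_rng_state_all(state["cuda"])
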
141_128_e3_3e-5/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85cc917696accd53b538dcd9c87ae529f6a05b9367b010c3c6f3ff7ecb0e2770
+size 1401
141_128_e3_3e-5/special_tokens_map.json ADDED
@@ -0,0 +1,45 @@
+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<reponame>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
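
Note the pad_token: instead of adding a new token, this checkpoint reuses <reponame> (an existing StarCoder-family special token) for padding, which leaves the vocabulary and embedding matrix untouched. A quick sketch to confirm the map is picked up on load (folder path from this commit):

# Check that the saved special-token map is applied when the tokenizer loads.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("141_128_e3_3e-5")
print(tok.pad_token, tok.pad_token_id)  # expected: "<reponame>" and its existing id
print(tok.bos_token, tok.eos_token)     # both "<|endoftext|>"
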
141_128_e3_3e-5/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
141_128_e3_3e-5/tokenizer_config.json ADDED
@@ -0,0 +1,188 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "1": {"content": "<fim_prefix>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "2": {"content": "<fim_middle>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "3": {"content": "<fim_suffix>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "4": {"content": "<fim_pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "5": {"content": "<filename>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "6": {"content": "<gh_stars>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "7": {"content": "<issue_start>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "8": {"content": "<issue_comment>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "9": {"content": "<issue_closed>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "10": {"content": "<jupyter_start>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "11": {"content": "<jupyter_text>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "12": {"content": "<jupyter_code>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "13": {"content": "<jupyter_output>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "14": {"content": "<empty_output>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "15": {"content": "<commit_before>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "16": {"content": "<commit_msg>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "17": {"content": "<commit_after>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "18": {"content": "<reponame>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<reponame>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}
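
Two inference-relevant settings here are model_max_length 8192 and padding_side "left" — the usual choice for batched decoder-only generation, so every prompt ends flush against the first generated token. A minimal sketch; the prompts are illustrative:

# Left-padded batch encoding, matching padding_side="left" in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("141_128_e3_3e-5")
batch = tok(
    ["def hello():", "import os"],  # illustrative prompts
    return_tensors="pt",
    padding=True,                   # pads on the left per the saved config
    truncation=True,
    max_length=8192,                # model_max_length from the config
)
print(batch["input_ids"].shape, batch["attention_mask"][0])
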
141_128_e3_3e-5/trainer_state.json ADDED
@@ -0,0 +1,2365 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1665,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {"epoch": 0.009017132551848512, "grad_norm": 0.9823748469352722, "learning_rate": 1.4285714285714286e-06, "loss": 1.4239, "step": 5},
+    {"epoch": 0.018034265103697024, "grad_norm": 0.7504017353057861, "learning_rate": 3.2142857142857143e-06, "loss": 1.4605, "step": 10},
+    {"epoch": 0.027051397655545536, "grad_norm": 0.5925265550613403, "learning_rate": 4.9999999999999996e-06, "loss": 1.4271, "step": 15},
+    {"epoch": 0.03606853020739405, "grad_norm": 0.5115848183631897, "learning_rate": 6.785714285714286e-06, "loss": 1.3737, "step": 20},
+    {"epoch": 0.04508566275924256, "grad_norm": 0.5237375497817993, "learning_rate": 8.571428571428571e-06, "loss": 1.4109, "step": 25},
+    {"epoch": 0.05410279531109107, "grad_norm": 0.5411755442619324, "learning_rate": 1.0357142857142857e-05, "loss": 1.3755, "step": 30},
+    {"epoch": 0.06311992786293959, "grad_norm": 0.552263617515564, "learning_rate": 1.2142857142857144e-05, "loss": 1.4193, "step": 35},
+    {"epoch": 0.0721370604147881, "grad_norm": 0.47678422927856445, "learning_rate": 1.3928571428571429e-05, "loss": 1.3528, "step": 40},
+    {"epoch": 0.0811541929666366, "grad_norm": 0.6749147772789001, "learning_rate": 1.5714285714285715e-05, "loss": 1.3333, "step": 45},
+    {"epoch": 0.09017132551848513, "grad_norm": 0.48501694202423096, "learning_rate": 1.7500000000000002e-05, "loss": 1.3818, "step": 50},
+    {"epoch": 0.09918845807033363, "grad_norm": 0.48375576734542847, "learning_rate": 1.928571428571429e-05, "loss": 1.2968, "step": 55},
+    {"epoch": 0.10820559062218214, "grad_norm": 0.5306784510612488, "learning_rate": 2.107142857142857e-05, "loss": 1.2767, "step": 60},
+    {"epoch": 0.11722272317403065, "grad_norm": 0.5145726203918457, "learning_rate": 2.2857142857142858e-05, "loss": 1.3462, "step": 65},
+    {"epoch": 0.12623985572587917, "grad_norm": 0.5507055521011353, "learning_rate": 2.464285714285714e-05, "loss": 1.3082, "step": 70},
+    {"epoch": 0.13525698827772767, "grad_norm": 0.6329906582832336, "learning_rate": 2.6428571428571428e-05, "loss": 1.303, "step": 75},
+    {"epoch": 0.1442741208295762, "grad_norm": 0.6399571895599365, "learning_rate": 2.8214285714285714e-05, "loss": 1.2617, "step": 80},
+    {"epoch": 0.1532912533814247, "grad_norm": 0.6082172393798828, "learning_rate": 3e-05, "loss": 1.2564, "step": 85},
+    {"epoch": 0.1623083859332732, "grad_norm": 0.6711664199829102, "learning_rate": 2.9999259655754585e-05, "loss": 1.2222, "step": 90},
+    {"epoch": 0.17132551848512173, "grad_norm": 0.5679822564125061, "learning_rate": 2.9997038696099626e-05, "loss": 1.2208, "step": 95},
+    {"epoch": 0.18034265103697025, "grad_norm": 0.8000366687774658, "learning_rate": 2.9993337340271743e-05, "loss": 1.1955, "step": 100},
+    {"epoch": 0.18935978358881875, "grad_norm": 0.7099812030792236, "learning_rate": 2.9988155953641272e-05, "loss": 1.1798, "step": 105},
+    {"epoch": 0.19837691614066727, "grad_norm": 0.6969692707061768, "learning_rate": 2.998149504767618e-05, "loss": 1.185, "step": 110},
+    {"epoch": 0.2073940486925158, "grad_norm": 0.6630687117576599, "learning_rate": 2.9973355279891595e-05, "loss": 1.1046, "step": 115},
+    {"epoch": 0.2164111812443643, "grad_norm": 0.7325212955474854, "learning_rate": 2.996373745378487e-05, "loss": 1.1462, "step": 120},
+    {"epoch": 0.2254283137962128, "grad_norm": 0.7677739858627319, "learning_rate": 2.995264251875631e-05, "loss": 1.141, "step": 125},
+    {"epoch": 0.2344454463480613, "grad_norm": 0.6918173432350159, "learning_rate": 2.9940071570015415e-05, "loss": 1.1321, "step": 130},
+    {"epoch": 0.24346257889990983, "grad_norm": 0.7929957509040833, "learning_rate": 2.9926025848472798e-05, "loss": 1.0879, "step": 135},
+    {"epoch": 0.25247971145175835, "grad_norm": 0.8735880255699158, "learning_rate": 2.991050674061767e-05, "loss": 1.0603, "step": 140},
+    {"epoch": 0.26149684400360684, "grad_norm": 0.816652774810791, "learning_rate": 2.9893515778380997e-05, "loss": 1.0656, "step": 145},
+    {"epoch": 0.27051397655545534, "grad_norm": 0.8197211027145386, "learning_rate": 2.9875054638984253e-05, "loss": 1.0459, "step": 150},
+    {"epoch": 0.2795311091073039, "grad_norm": 0.9397364854812622, "learning_rate": 2.9855125144773885e-05, "loss": 1.0727, "step": 155},
+    {"epoch": 0.2885482416591524, "grad_norm": 0.9460065364837646, "learning_rate": 2.9833729263041407e-05, "loss": 1.0281, "step": 160},
+    {"epoch": 0.2975653742110009, "grad_norm": 0.9303812980651855, "learning_rate": 2.9810869105829202e-05, "loss": 0.9843, "step": 165},
+    {"epoch": 0.3065825067628494, "grad_norm": 1.0594195127487183, "learning_rate": 2.9786546929722055e-05, "loss": 0.9947, "step": 170},
+    {"epoch": 0.3155996393146979, "grad_norm": 0.9684775471687317, "learning_rate": 2.9760765135624387e-05, "loss": 0.9467, "step": 175},
+    {"epoch": 0.3246167718665464, "grad_norm": 0.9406678676605225, "learning_rate": 2.9733526268523238e-05, "loss": 0.9575, "step": 180},
+    {"epoch": 0.33363390441839497, "grad_norm": 0.9979203939437866, "learning_rate": 2.9704833017237077e-05, "loss": 0.9751, "step": 185},
+    {"epoch": 0.34265103697024346, "grad_norm": 1.058048963546753, "learning_rate": 2.967468821415038e-05, "loss": 0.9804, "step": 190},
+    {"epoch": 0.35166816952209196, "grad_norm": 1.0112496614456177, "learning_rate": 2.9643094834933997e-05, "loss": 0.9342, "step": 195},
+    {"epoch": 0.3606853020739405, "grad_norm": 1.0593630075454712, "learning_rate": 2.9610055998251473e-05, "loss": 0.9349, "step": 200},
+    {"epoch": 0.369702434625789, "grad_norm": 0.9569016695022583, "learning_rate": 2.9575574965451156e-05, "loss": 0.9317, "step": 205},
+    {"epoch": 0.3787195671776375, "grad_norm": 1.0082752704620361, "learning_rate": 2.9539655140244263e-05, "loss": 0.9217, "step": 210},
+    {"epoch": 0.38773669972948605, "grad_norm": 1.0427634716033936, "learning_rate": 2.9502300068368922e-05, "loss": 0.8333, "step": 215},
+    {"epoch": 0.39675383228133454, "grad_norm": 1.2071142196655273, "learning_rate": 2.946351343724013e-05, "loss": 0.8514, "step": 220},
+    {"epoch": 0.40577096483318303, "grad_norm": 1.0921831130981445, "learning_rate": 2.9423299075585775e-05, "loss": 0.8822, "step": 225},
+    {"epoch": 0.4147880973850316, "grad_norm": 1.0517659187316895, "learning_rate": 2.9381660953068686e-05, "loss": 0.9019, "step": 230},
+    {"epoch": 0.4238052299368801, "grad_norm": 1.1370799541473389, "learning_rate": 2.9338603179894784e-05, "loss": 0.8457, "step": 235},
+    {"epoch": 0.4328223624887286, "grad_norm": 1.501523733139038, "learning_rate": 2.929413000640735e-05, "loss": 0.8285, "step": 240},
+    {"epoch": 0.4418394950405771, "grad_norm": 1.3357998132705688, "learning_rate": 2.9248245822667457e-05, "loss": 0.8026, "step": 245},
+    {"epoch": 0.4508566275924256, "grad_norm": 1.1207865476608276, "learning_rate": 2.920095515802062e-05, "loss": 0.8388, "step": 250},
+    {"epoch": 0.4598737601442741, "grad_norm": 1.2629461288452148, "learning_rate": 2.9152262680649704e-05, "loss": 0.7775, "step": 255},
+    {"epoch": 0.4688908926961226, "grad_norm": 1.1020766496658325, "learning_rate": 2.9102173197114094e-05, "loss": 0.7906, "step": 260},
+    {"epoch": 0.47790802524797116, "grad_norm": 1.5498449802398682, "learning_rate": 2.9050691651875243e-05, "loss": 0.8041, "step": 265},
+    {"epoch": 0.48692515779981965, "grad_norm": 1.1285768747329712, "learning_rate": 2.8997823126808583e-05, "loss": 0.7847, "step": 270},
+    {"epoch": 0.49594229035166815, "grad_norm": 1.1491355895996094, "learning_rate": 2.894357284070189e-05, "loss": 0.7573, "step": 275},
+    {"epoch": 0.5049594229035167, "grad_norm": 1.1342307329177856, "learning_rate": 2.888794614874011e-05, "loss": 0.7745, "step": 280},
+    {"epoch": 0.5139765554553652, "grad_norm": 1.1102683544158936, "learning_rate": 2.883094854197676e-05, "loss": 0.7843, "step": 285},
+    {"epoch": 0.5229936880072137, "grad_norm": 1.1436070203781128, "learning_rate": 2.877258564679185e-05, "loss": 0.8108, "step": 290},
+    {"epoch": 0.5320108205590622, "grad_norm": 1.3217586278915405, "learning_rate": 2.8712863224336533e-05, "loss": 0.7258, "step": 295},
+    {"epoch": 0.5410279531109107, "grad_norm": 1.2891480922698975, "learning_rate": 2.8651787169964374e-05, "loss": 0.7487, "step": 300},
+    {"epoch": 0.5500450856627592, "grad_norm": 1.217319130897522, "learning_rate": 2.8589363512649432e-05, "loss": 0.7301, "step": 305},
+    {"epoch": 0.5590622182146078, "grad_norm": 1.2876780033111572, "learning_rate": 2.8525598414391104e-05, "loss": 0.6748, "step": 310},
+    {"epoch": 0.5680793507664562, "grad_norm": 1.2525529861450195, "learning_rate": 2.846049816960585e-05, "loss": 0.6724, "step": 315},
+    {"epoch": 0.5770964833183048, "grad_norm": 1.319061517715454, "learning_rate": 2.83940692045059e-05, "loss": 0.713, "step": 320},
+    {"epoch": 0.5861136158701533, "grad_norm": 1.136137843132019, "learning_rate": 2.8326318076464852e-05, "loss": 0.6584, "step": 325},
+    {"epoch": 0.5951307484220018, "grad_norm": 1.529517650604248, "learning_rate": 2.8257251473370408e-05, "loss": 0.6824, "step": 330},
+    {"epoch": 0.6041478809738503, "grad_norm": 1.3196120262145996, "learning_rate": 2.8186876212964185e-05, "loss": 0.6721, "step": 335},
+    {"epoch": 0.6131650135256989, "grad_norm": 1.1170713901519775, "learning_rate": 2.811519924216873e-05, "loss": 0.6481, "step": 340},
+    {"epoch": 0.6221821460775473, "grad_norm": 1.2801036834716797, "learning_rate": 2.8042227636401757e-05, "loss": 0.6582, "step": 345},
+    {"epoch": 0.6311992786293958, "grad_norm": 1.3887863159179688, "learning_rate": 2.796796859887772e-05, "loss": 0.6751, "step": 350},
+    {"epoch": 0.6402164111812444, "grad_norm": 1.3034902811050415, "learning_rate": 2.7892429459896766e-05, "loss": 0.7017, "step": 355},
+    {"epoch": 0.6492335437330928, "grad_norm": 1.2114206552505493, "learning_rate": 2.7815617676121138e-05, "loss": 0.6343, "step": 360},
+    {"epoch": 0.6582506762849414, "grad_norm": 1.3389161825180054, "learning_rate": 2.773754082983912e-05, "loss": 0.6524, "step": 365},
+    {"epoch": 0.6672678088367899, "grad_norm": 1.3806394338607788, "learning_rate": 2.7658206628216556e-05, "loss": 0.6147, "step": 370},
+    {"epoch": 0.6762849413886384, "grad_norm": 1.450261116027832, "learning_rate": 2.7577622902536064e-05, "loss": 0.6155, "step": 375},
+    {"epoch": 0.6853020739404869, "grad_norm": 1.3945132493972778, "learning_rate": 2.7495797607423986e-05, "loss": 0.6374, "step": 380},
+    {"epoch": 0.6943192064923355, "grad_norm": 1.341496467590332, "learning_rate": 2.7412738820065173e-05, "loss": 0.5744, "step": 385},
+    {"epoch": 0.7033363390441839, "grad_norm": 1.3878300189971924, "learning_rate": 2.732845473940566e-05, "loss": 0.6135, "step": 390},
+    {"epoch": 0.7123534715960325, "grad_norm": 1.4109869003295898, "learning_rate": 2.7242953685343327e-05, "loss": 0.5897, "step": 395},
+    {"epoch": 0.721370604147881, "grad_norm": 1.2432441711425781, "learning_rate": 2.7156244097906614e-05, "loss": 0.5635, "step": 400},
+    {"epoch": 0.7303877366997295, "grad_norm": 1.3838210105895996, "learning_rate": 2.7068334536421408e-05, "loss": 0.564, "step": 405},
+    {"epoch": 0.739404869251578, "grad_norm": 1.266007661819458, "learning_rate": 2.6979233678666102e-05, "loss": 0.5788, "step": 410},
+    {"epoch": 0.7484220018034266, "grad_norm": 1.417275309562683, "learning_rate": 2.6888950320014993e-05, "loss": 0.5396, "step": 415},
+    {"epoch": 0.757439134355275, "grad_norm": 1.2462772130966187, "learning_rate": 2.6797493372570098e-05, "loss": 0.5767, "step": 420},
+    {"epoch": 0.7664562669071235, "grad_norm": 1.5275555849075317, "learning_rate": 2.6704871864281377e-05, "loss": 0.5235, "step": 425},
+    {"epoch": 0.7754733994589721, "grad_norm": 1.2584601640701294, "learning_rate": 2.6611094938055586e-05, "loss": 0.5419, "step": 430},
+    {"epoch": 0.7844905320108205, "grad_norm": 1.4714555740356445, "learning_rate": 2.651617185085375e-05, "loss": 0.5435, "step": 435},
+    {"epoch": 0.7935076645626691, "grad_norm": 1.332058310508728, "learning_rate": 2.642011197277738e-05, "loss": 0.5018, "step": 440},
+    {"epoch": 0.8025247971145176, "grad_norm": 1.3258447647094727, "learning_rate": 2.6322924786143544e-05, "loss": 0.5446, "step": 445},
+    {"epoch": 0.8115419296663661, "grad_norm": 1.1995099782943726, "learning_rate": 2.6224619884548814e-05, "loss": 0.5086, "step": 450},
+    {"epoch": 0.8205590622182146, "grad_norm": 1.2953500747680664, "learning_rate": 2.612520697192229e-05, "loss": 0.5596, "step": 455},
+    {"epoch": 0.8295761947700632, "grad_norm": 1.9276663064956665, "learning_rate": 2.6024695861567675e-05, "loss": 0.536, "step": 460},
+    {"epoch": 0.8385933273219116, "grad_norm": 1.7397925853729248, "learning_rate": 2.592309647519458e-05, "loss": 0.5438, "step": 465},
+    {"epoch": 0.8476104598737602, "grad_norm": 1.4937227964401245, "learning_rate": 2.5820418841939152e-05, "loss": 0.4989, "step": 470},
+    {"epoch": 0.8566275924256087, "grad_norm": 1.4302619695663452, "learning_rate": 2.5716673097374047e-05, "loss": 0.5365, "step": 475},
+    {"epoch": 0.8656447249774571, "grad_norm": 1.3358851671218872, "learning_rate": 2.5611869482507924e-05, "loss": 0.5371, "step": 480},
+    {"epoch": 0.8746618575293057, "grad_norm": 1.4897438287734985, "learning_rate": 2.550601834277454e-05, "loss": 0.4802, "step": 485},
+    {"epoch": 0.8836789900811542, "grad_norm": 1.492023229598999, "learning_rate": 2.539913012701152e-05, "loss": 0.4706, "step": 490},
+    {"epoch": 0.8926961226330027, "grad_norm": 1.3626470565795898, "learning_rate": 2.529121538642892e-05, "loss": 0.478, "step": 495},
+    {"epoch": 0.9017132551848512, "grad_norm": 1.3528685569763184, "learning_rate": 2.51822847735677e-05, "loss": 0.4751, "step": 500},
+    {"epoch": 0.9107303877366997, "grad_norm": 1.4771296977996826, "learning_rate": 2.5072349041248175e-05, "loss": 0.4557, "step": 505},
+    {"epoch": 0.9197475202885482, "grad_norm": 1.2409591674804688, "learning_rate": 2.496141904150859e-05, "loss": 0.4721, "step": 510},
+    {"epoch": 0.9287646528403968, "grad_norm": 1.4840649366378784, "learning_rate": 2.484950572453386e-05, "loss": 0.4187, "step": 515},
+    {"epoch": 0.9377817853922452, "grad_norm": 1.2992953062057495, "learning_rate": 2.4736620137574686e-05, "loss": 0.4753, "step": 520},
+    {"epoch": 0.9467989179440938, "grad_norm": 1.5280839204788208, "learning_rate": 2.4622773423857032e-05, "loss": 0.4265, "step": 525},
+    {"epoch": 0.9558160504959423, "grad_norm": 1.377023696899414, "learning_rate": 2.4507976821482138e-05, "loss": 0.429, "step": 530},
+    {"epoch": 0.9648331830477908, "grad_norm": 1.4453728199005127, "learning_rate": 2.4392241662317205e-05, "loss": 0.4469, "step": 535},
+    {"epoch": 0.9738503155996393, "grad_norm": 1.395015001296997, "learning_rate": 2.4275579370876772e-05, "loss": 0.405, "step": 540},
+    {"epoch": 0.9828674481514879, "grad_norm": 1.4183626174926758, "learning_rate": 2.4158001463194998e-05, "loss": 0.4253, "step": 545},
+    {"epoch": 0.9918845807033363, "grad_norm": 1.4114842414855957, "learning_rate": 2.4039519545688848e-05, "loss": 0.4298, "step": 550},
+    {"epoch": 1.0, "grad_norm": 1.9220740795135498, "learning_rate": 2.392014531401244e-05, "loss": 0.3982, "step": 555},
+    {"epoch": 1.0090171325518484, "grad_norm": 1.6756243705749512, "learning_rate": 2.37998905519025e-05, "loss": 0.3248, "step": 560},
+    {"epoch": 1.018034265103697, "grad_norm": 1.4207195043563843, "learning_rate": 2.3678767130015174e-05, "loss": 0.3696, "step": 565},
+    {"epoch": 1.0270513976555455, "grad_norm": 1.3674201965332031, "learning_rate": 2.3556787004754253e-05, "loss": 0.3699, "step": 570},
+    {"epoch": 1.036068530207394, "grad_norm": 1.354906439781189, "learning_rate": 2.3433962217090904e-05, "loss": 0.3395, "step": 575},
+    {"epoch": 1.0450856627592426, "grad_norm": 1.4456382989883423, "learning_rate": 2.3310304891375092e-05, "loss": 0.3367, "step": 580},
+    {"epoch": 1.054102795311091, "grad_norm": 1.4264848232269287, "learning_rate": 2.3185827234138756e-05, "loss": 0.3615, "step": 585},
+    {"epoch": 1.0631199278629395, "grad_norm": 1.5015490055084229, "learning_rate": 2.306054153289085e-05, "loss": 0.326, "step": 590},
+    {"epoch": 1.0721370604147882, "grad_norm": 1.2058688402175903, "learning_rate": 2.2934460154904436e-05, "loss": 0.3253, "step": 595},
+    {"epoch": 1.0811541929666366, "grad_norm": 1.3843647241592407, "learning_rate": 2.280759554599587e-05, "loss": 0.3636, "step": 600},
+    {"epoch": 1.090171325518485, "grad_norm": 1.411778211593628, "learning_rate": 2.2679960229296244e-05, "loss": 0.3349, "step": 605},
+    {"epoch": 1.0991884580703337, "grad_norm": 1.3525984287261963, "learning_rate": 2.255156680401518e-05, "loss": 0.3903, "step": 610},
+    {"epoch": 1.1082055906221822, "grad_norm": 1.51930832862854, "learning_rate": 2.242242794419715e-05, "loss": 0.3149, "step": 615},
+    {"epoch": 1.1172227231740306, "grad_norm": 1.362392783164978, "learning_rate": 2.2292556397470394e-05, "loss": 0.3107, "step": 620},
+    {"epoch": 1.1262398557258793, "grad_norm": 1.4513483047485352, "learning_rate": 2.2161964983788535e-05, "loss": 0.3219, "step": 625},
+    {"epoch": 1.1352569882777277, "grad_norm": 1.2779744863510132, "learning_rate": 2.2030666594165135e-05, "loss": 0.3019, "step": 630},
+    {"epoch": 1.1442741208295761, "grad_norm": 1.532083511352539, "learning_rate": 2.1898674189401148e-05, "loss": 0.3372, "step": 635},
+    {"epoch": 1.1532912533814248, "grad_norm": 1.2743816375732422, "learning_rate": 2.1766000798805542e-05, "loss": 0.3168, "step": 640},
+    {"epoch": 1.1623083859332732, "grad_norm": 1.5035699605941772, "learning_rate": 2.1632659518909156e-05, "loss": 0.2981, "step": 645},
+    {"epoch": 1.1713255184851217, "grad_norm": 1.286419153213501, "learning_rate": 2.1498663512171885e-05, "loss": 0.3447, "step": 650},
+    {"epoch": 1.1803426510369703, "grad_norm": 1.2940813302993774, "learning_rate": 2.13640260056834e-05, "loss": 0.3169, "step": 655},
+    {"epoch": 1.1893597835888188, "grad_norm": 1.3351128101348877, "learning_rate": 2.1228760289857456e-05, "loss": 0.3133, "step": 660},
+    {"epoch": 1.1983769161406672, "grad_norm": 1.4152436256408691, "learning_rate": 2.1092879717119955e-05, "loss": 0.2943, "step": 665},
+    {"epoch": 1.2073940486925159, "grad_norm": 1.4584360122680664, "learning_rate": 2.0956397700590915e-05, "loss": 0.2995, "step": 670},
+    {"epoch": 1.2164111812443643, "grad_norm": 1.2866615056991577, "learning_rate": 2.0819327712760396e-05, "loss": 0.2878, "step": 675},
+    {"epoch": 1.2254283137962128, "grad_norm": 1.488740086555481, "learning_rate": 2.068168328415864e-05, "loss": 0.3024, "step": 680},
+    {"epoch": 1.2344454463480612, "grad_norm": 1.3610576391220093, "learning_rate": 2.054347800202039e-05, "loss": 0.2683, "step": 685},
+    {"epoch": 1.2434625788999099, "grad_norm": 1.2630383968353271, "learning_rate": 2.0404725508943693e-05, "loss": 0.3304, "step": 690},
+    {"epoch": 1.2524797114517583, "grad_norm": 1.224249243736267, "learning_rate": 2.0265439501543184e-05, "loss": 0.2823, "step": 695},
+    {"epoch": 1.2614968440036067, "grad_norm": 1.3269656896591187, "learning_rate": 2.012563372909807e-05, "loss": 0.2989, "step": 700},
+    {"epoch": 1.2705139765554554, "grad_norm": 1.3828004598617554, "learning_rate": 1.9985321992194896e-05, "loss": 0.2456, "step": 705},
+    {"epoch": 1.2795311091073038, "grad_norm": 1.3054155111312866, "learning_rate": 1.984451814136526e-05, "loss": 0.2916, "step": 710},
+    {"epoch": 1.2885482416591523, "grad_norm": 1.3246440887451172, "learning_rate": 1.970323607571859e-05, "loss": 0.2625, "step": 715},
+    {"epoch": 1.297565374211001, "grad_norm": 1.428695559501648, "learning_rate": 1.956148974157012e-05, "loss": 0.2887, "step": 720},
+    {"epoch": 1.3065825067628494, "grad_norm": 1.4095420837402344, "learning_rate": 1.9419293131064237e-05, "loss": 0.2554, "step": 725},
+    {"epoch": 1.3155996393146978, "grad_norm": 1.4944108724594116, "learning_rate": 1.9276660280793223e-05, "loss": 0.2312, "step": 730},
+    {"epoch": 1.3246167718665465, "grad_norm": 1.2393672466278076, "learning_rate": 1.9133605270411748e-05, "loss": 0.3093, "step": 735},
+    {"epoch": 1.333633904418395, "grad_norm": 1.2808177471160889, "learning_rate": 1.899014222124698e-05, "loss": 0.2566, "step": 740},
+    {"epoch": 1.3426510369702434, "grad_norm": 1.405255675315857, "learning_rate": 1.8846285294904616e-05, "loss": 0.2433, "step": 745},
+    {"epoch": 1.351668169522092, "grad_norm": 1.5428978204727173, "learning_rate": 1.8702048691871026e-05, "loss": 0.2547, "step": 750},
+    {"epoch": 1.3606853020739405, "grad_norm": 1.457382321357727, "learning_rate": 1.855744665011139e-05, "loss": 0.2684, "step": 755},
+    {"epoch": 1.369702434625789, "grad_norm": 1.3935832977294922, "learning_rate": 1.8412493443664316e-05, "loss": 0.2698, "step": 760},
+    {"epoch": 1.3787195671776376, "grad_norm": 1.2251616716384888, "learning_rate": 1.8267203381232774e-05, "loss": 0.2378, "step": 765},
+    {"epoch": 1.387736699729486, "grad_norm": 1.483079195022583, "learning_rate": 1.812159080477165e-05, "loss": 0.251, "step": 770},
+    {"epoch": 1.3967538322813344, "grad_norm": 1.3615831136703491, "learning_rate": 1.7975670088072e-05, "loss": 0.253, "step": 775},
+    {"epoch": 1.405770964833183, "grad_norm": 1.2965681552886963, "learning_rate": 1.7829455635342242e-05, "loss": 0.2289, "step": 780},
+    {"epoch": 1.4147880973850315, "grad_norm": 1.2582015991210938, "learning_rate": 1.7682961879786166e-05, "loss": 0.2536, "step": 785},
+    {"epoch": 1.42380522993688, "grad_norm": 1.4277766942977905, "learning_rate": 1.7536203282178315e-05, "loss": 0.213, "step": 790},
+    {"epoch": 1.4328223624887286, "grad_norm": 1.32048499584198, "learning_rate": 1.7389194329436436e-05, "loss": 0.2354, "step": 795},
+    {"epoch": 1.441839495040577, "grad_norm": 1.4123876094818115, "learning_rate": 1.72419495331915e-05, "loss": 0.2404, "step": 800},
+    {"epoch": 1.4508566275924255, "grad_norm": 1.4886255264282227, "learning_rate": 1.7094483428355177e-05, "loss": 0.2578, "step": 805},
+    {"epoch": 1.4598737601442742, "grad_norm": 1.2853409051895142, "learning_rate": 1.694681057168508e-05, "loss": 0.2424, "step": 810},
+    {"epoch": 1.4688908926961226, "grad_norm": 1.1790263652801514, "learning_rate": 1.6798945540347822e-05, "loss": 0.2035, "step": 815},
+    {"epoch": 1.477908025247971, "grad_norm": 1.4339054822921753, "learning_rate": 1.665090293048009e-05, "loss": 0.2246, "step": 820},
+    {"epoch": 1.4869251577998197, "grad_norm": 1.3507592678070068, "learning_rate": 1.6502697355747775e-05, "loss": 0.1983, "step": 825},
+    {"epoch": 1.4959422903516681, "grad_norm": 1.3443710803985596, "learning_rate": 1.635434344590348e-05, "loss": 0.1907, "step": 830},
+    {"epoch": 1.5049594229035166, "grad_norm": 1.3209919929504395, "learning_rate": 1.6205855845342314e-05, "loss": 0.2252, "step": 835},
+    {"epoch": 1.5139765554553652, "grad_norm": 1.2697038650512695, "learning_rate": 1.6057249211656363e-05, "loss": 0.219, "step": 840},
+    {"epoch": 1.5229936880072137, "grad_norm": 1.309358835220337, "learning_rate": 1.5908538214187767e-05, "loss": 0.2227, "step": 845},
+    {"epoch": 1.5320108205590621, "grad_norm": 1.342112421989441, "learning_rate": 1.5759737532580692e-05, "loss": 0.2037, "step": 850},
+    {"epoch": 1.5410279531109108, "grad_norm": 1.3193739652633667, "learning_rate": 1.5610861855332244e-05, "loss": 0.2037, "step": 855},
+    {"epoch": 1.5500450856627592, "grad_norm": 1.2694180011749268, "learning_rate": 1.5461925878342558e-05, "loss": 0.194, "step": 860},
+    {"epoch": 1.5590622182146077, "grad_norm": 1.4624671936035156, "learning_rate": 1.531294430346409e-05, "loss": 0.2328, "step": 865},
+    {"epoch": 1.5680793507664563, "grad_norm": 1.3163915872573853, "learning_rate": 1.5163931837050395e-05, "loss": 0.2015, "step": 870},
+    {"epoch": 1.5770964833183048, "grad_norm": 1.1455012559890747, "learning_rate": 1.5014903188504401e-05, "loss": 0.2009, "step": 875},
+    {"epoch": 1.5861136158701532, "grad_norm": 1.2366622686386108, "learning_rate": 1.4865873068826437e-05, "loss": 0.2112, "step": 880},
+    {"epoch": 1.5951307484220019, "grad_norm": 1.3390120267868042, "learning_rate": 1.4716856189162018e-05, "loss": 0.2036, "step": 885},
+    {"epoch": 1.6041478809738503, "grad_norm": 1.5190117359161377, "learning_rate": 1.4567867259349735e-05, "loss": 0.203, "step": 890},
+    {"epoch": 1.6131650135256987, "grad_norm": 1.283329725265503, "learning_rate": 1.4418920986469153e-05, "loss": 0.1989, "step": 895},
+    {"epoch": 1.6221821460775474, "grad_norm": 1.073015570640564, "learning_rate": 1.427003207338908e-05, "loss": 0.2006, "step": 900},
+    {"epoch": 1.6311992786293958, "grad_norm": 1.2181464433670044, "learning_rate": 1.412121521731618e-05, "loss": 0.2058, "step": 905},
+    {"epoch": 1.6402164111812443, "grad_norm": 1.3389447927474976, "learning_rate": 1.397248510834419e-05, "loss": 0.1821, "step": 910},
+    {"epoch": 1.649233543733093, "grad_norm": 1.2447868585586548, "learning_rate": 1.3823856428003813e-05, "loss": 0.1993, "step": 915},
+    {"epoch": 1.6582506762849414, "grad_norm": 1.231811285018921, "learning_rate": 1.367534384781348e-05, "loss": 0.1918, "step": 920},
+    {"epoch": 1.6672678088367898, "grad_norm": 1.3336375951766968, "learning_rate": 1.3526962027831074e-05, "loss": 0.2111, "step": 925},
+    {"epoch": 1.6762849413886385, "grad_norm": 1.4158216714859009, "learning_rate": 1.3378725615206794e-05, "loss": 0.1831, "step": 930},
+    {"epoch": 1.685302073940487, "grad_norm": 1.5446593761444092, "learning_rate": 1.3230649242737312e-05, "loss": 0.1737, "step": 935},
+    {"epoch": 1.6943192064923354, "grad_norm": 1.2693215608596802, "learning_rate": 1.3082747527421336e-05, "loss": 0.1835, "step": 940},
+    {"epoch": 1.703336339044184, "grad_norm": 1.31221342086792, "learning_rate": 1.29350350690167e-05, "loss": 0.1795, "step": 945},
+    {"epoch": 1.7123534715960325, "grad_norm": 1.1850473880767822, "learning_rate": 1.2787526448599218e-05, "loss": 0.1634, "step": 950},
+    {"epoch": 1.721370604147881, "grad_norm": 1.5322030782699585, "learning_rate": 1.2640236227123321e-05, "loss": 0.1863, "step": 955},
+    {"epoch": 1.7303877366997296, "grad_norm": 1.3057621717453003, "learning_rate": 1.2493178943984734e-05, "loss": 0.1779, "step": 960},
+    {"epoch": 1.739404869251578, "grad_norm": 1.0997627973556519, "learning_rate": 1.234636911558522e-05, "loss": 0.1759, "step": 965},
+    {"epoch": 1.7484220018034264, "grad_norm": 1.3787986040115356, "learning_rate": 1.2199821233899677e-05, "loss": 0.1485, "step": 970},
+    {"epoch": 1.757439134355275, "grad_norm": 1.1948957443237305, "learning_rate": 1.2053549765045543e-05, "loss": 0.1704, "step": 975},
+    {"epoch": 1.7664562669071235, "grad_norm": 1.4918591976165771, "learning_rate": 1.1907569147854864e-05, "loss": 0.1965, "step": 980},
+    {"epoch": 1.775473399458972, "grad_norm": 1.1869559288024902, "learning_rate": 1.1761893792448944e-05, "loss": 0.182, "step": 985},
+    {"epoch": 1.7844905320108206, "grad_norm": 1.3466490507125854, "learning_rate": 1.161653807881593e-05, "loss": 0.1902, "step": 990},
+    {"epoch": 1.793507664562669, "grad_norm": 1.178686261177063, "learning_rate": 1.1471516355391302e-05, "loss": 0.1502, "step": 995},
+    {"epoch": 1.8025247971145175, "grad_norm": 1.3178352117538452, "learning_rate": 1.1326842937641523e-05, "loss": 0.1682, "step": 1000},
+    {"epoch": 1.8115419296663662, "grad_norm": 1.1503818035125732, "learning_rate": 1.1182532106650887e-05, "loss": 0.1724, "step": 1005},
+    {"epoch": 1.8205590622182146, "grad_norm": 1.2390116453170776, "learning_rate": 1.1038598107711841e-05, "loss": 0.1563, "step": 1010},
+    {"epoch": 1.829576194770063, "grad_norm": 1.200169324874878, "learning_rate": 1.0895055148918758e-05, "loss": 0.1476, "step": 1015},
+    {"epoch": 1.8385933273219117, "grad_norm": 1.2306461334228516, "learning_rate": 1.075191739976544e-05, "loss": 0.1582, "step": 1020},
+    {"epoch": 1.8476104598737602, "grad_norm": 1.3552227020263672, "learning_rate": 1.0609198989746403e-05, "loss": 0.1479, "step": 1025},
+    {"epoch": 1.8566275924256086, "grad_norm": 1.1709760427474976, "learning_rate": 1.046691400696213e-05, "loss": 0.139, "step": 1030},
+    {"epoch": 1.8656447249774573, "grad_norm": 1.1519287824630737, "learning_rate": 1.032507649672838e-05, "loss": 0.1733, "step": 1035},
+    {"epoch": 1.8746618575293057, "grad_norm": 1.2188918590545654, "learning_rate": 1.0183700460189745e-05, "loss": 0.1617, "step": 1040},
+    {"epoch": 1.8836789900811541, "grad_norm": 1.3665494918823242, "learning_rate": 1.0042799852937579e-05, "loss": 0.1646, "step": 1045},
+    {"epoch": 1.8926961226330028, "grad_norm": 1.3414051532745361, "learning_rate": 9.902388583632376e-06, "loss": 0.1454, "step": 1050},
+    {"epoch": 1.9017132551848512, "grad_norm": 1.2638881206512451, "learning_rate": 9.762480512630832e-06, "loss": 0.1477, "step": 1055},
+    {"epoch": 1.9107303877366997, "grad_norm": 1.1304572820663452, "learning_rate": 9.623089450617656e-06, "loss": 0.1306, "step": 1060},
+    {"epoch": 1.9197475202885483, "grad_norm": 1.160897970199585, "learning_rate": 9.484229157242254e-06, "loss": 0.1446, "step": 1065},
+    {"epoch": 1.9287646528403968, "grad_norm": 1.1717419624328613, "learning_rate": 9.345913339760514e-06, "loss": 0.1437, "step": 1070},
+    {"epoch": 1.9377817853922452, "grad_norm": 1.086374044418335, "learning_rate": 9.208155651681703e-06, "loss": 0.13, "step": 1075},
+    {"epoch": 1.9467989179440939, "grad_norm": 1.4221737384796143, "learning_rate": 9.070969691420711e-06, "loss": 0.153, "step": 1080},
+    {"epoch": 1.9558160504959423, "grad_norm": 1.0908355712890625, "learning_rate": 8.934369000955693e-06, "loss": 0.1423, "step": 1085},
+    {"epoch": 1.9648331830477908, "grad_norm": 1.0857588052749634, "learning_rate": 8.798367064491339e-06, "loss": 0.1413, "step": 1090},
+    {"epoch": 1.9738503155996394, "grad_norm": 1.189750075340271, "learning_rate": 8.66297730712778e-06, "loss": 0.1441, "step": 1095},
+    {"epoch": 1.9828674481514879, "grad_norm": 1.389917016029358, "learning_rate": 8.528213093535388e-06, "loss": 0.1467, "step": 1100},
+    {"epoch": 1.9918845807033363, "grad_norm": 1.1500518321990967, "learning_rate": 8.394087726635483e-06, "loss": 0.1179, "step": 1105},
+    {"epoch": 2.0, "grad_norm": 1.471095323562622, "learning_rate": 8.260614446287227e-06, "loss": 0.1206, "step": 1110},
+    {"epoch": 2.0090171325518487, "grad_norm": 1.3669251203536987, "learning_rate": 8.127806427980613e-06, "loss": 0.108, "step": 1115},
+    {"epoch": 2.018034265103697, "grad_norm": 0.9775254726409912, "learning_rate": 7.995676781535921e-06, "loss": 0.1095, "step": 1120},
+    {"epoch": 2.0270513976555455, "grad_norm": 1.0232077836990356, "learning_rate": 7.864238549809602e-06, "loss": 0.0968, "step": 1125},
+    {"epoch": 2.036068530207394, "grad_norm": 0.9106571078300476, "learning_rate": 7.733504707406782e-06, "loss": 0.1148, "step": 1130},
+    {"epoch": 2.0450856627592424, "grad_norm": 1.0568925142288208, "learning_rate": 7.6034881594005314e-06, "loss": 0.1066, "step": 1135},
+    {"epoch": 2.054102795311091, "grad_norm": 1.1241035461425781, "learning_rate": 7.4742017400579204e-06, "loss": 0.1032, "step": 1140},
+    {"epoch": 2.0631199278629397, "grad_norm": 1.1937174797058105, "learning_rate": 7.345658211573163e-06, "loss": 0.1131, "step": 1145},
+    {"epoch": 2.072137060414788, "grad_norm": 1.0716335773468018, "learning_rate": 7.217870262807805e-06, "loss": 0.1048, "step": 1150},
+    {"epoch": 2.0811541929666366, "grad_norm": 1.1201080083847046, "learning_rate": 7.090850508038179e-06, "loss": 0.106, "step": 1155},
+    {"epoch": 2.0901713255184853, "grad_norm": 1.0836949348449707, "learning_rate": 6.96461148571022e-06, "loss": 0.114, "step": 1160},
+    {"epoch": 2.0991884580703335, "grad_norm": 1.2321054935455322, "learning_rate": 6.839165657201749e-06, "loss": 0.1073, "step": 1165},
+    {"epoch": 2.108205590622182, "grad_norm": 0.8314303755760193, "learning_rate": 6.7145254055924136e-06, "loss": 0.0854, "step": 1170},
+    {"epoch": 2.117222723174031, "grad_norm": 1.0407764911651611, "learning_rate": 6.5907030344412746e-06, "loss": 0.1122, "step": 1175},
+    {"epoch": 2.126239855725879, "grad_norm": 0.9916133880615234, "learning_rate": 6.467710766572329e-06, "loss": 0.1098, "step": 1180},
+    {"epoch": 2.1352569882777277, "grad_norm": 1.0615887641906738, "learning_rate": 6.345560742867938e-06, "loss": 0.1041, "step": 1185},
+    {"epoch": 2.1442741208295764, "grad_norm": 1.1320112943649292, "learning_rate": 6.224265021070383e-06, "loss": 0.0997, "step": 1190},
+    {"epoch": 2.1532912533814246, "grad_norm": 1.1482691764831543, "learning_rate": 6.103835574591628e-06, "loss": 0.103, "step": 1195},
+    {"epoch": 2.1623083859332732, "grad_norm": 0.9989617466926575, "learning_rate": 5.98428429133135e-06, "loss": 0.0928, "step": 1200},
+    {"epoch": 2.171325518485122, "grad_norm": 1.2064361572265625, "learning_rate": 5.865622972503492e-06, "loss": 0.1089, "step": 1205},
+    {"epoch": 2.18034265103697, "grad_norm": 1.177975058555603, "learning_rate": 5.747863331471339e-06, "loss": 0.101, "step": 1210},
+    {"epoch": 2.1893597835888188, "grad_norm": 1.1105345487594604, "learning_rate": 5.6310169925912305e-06, "loss": 0.1014, "step": 1215},
+    {"epoch": 2.1983769161406674, "grad_norm": 0.9137780666351318, "learning_rate": 5.515095490065111e-06, "loss": 0.0888, "step": 1220},
+    {"epoch": 2.2073940486925157, "grad_norm": 1.0535608530044556, "learning_rate": 5.400110266801948e-06, "loss": 0.1054, "step": 1225},
+    {"epoch": 2.2164111812443643, "grad_norm": 0.987498939037323, "learning_rate": 5.286072673288202e-06, "loss": 0.0949, "step": 1230},
+    {"epoch": 2.225428313796213, "grad_norm": 0.9570640921592712, "learning_rate": 5.17299396646735e-06, "loss": 0.0967, "step": 1235},
+    {"epoch": 2.234445446348061, "grad_norm": 0.9348217844963074, "learning_rate": 5.060885308628707e-06, "loss": 0.1035, "step": 1240},
+    {"epoch": 2.24346257889991, "grad_norm": 0.8263827562332153, "learning_rate": 4.9497577663055645e-06, "loss": 0.0905, "step": 1245},
+    {"epoch": 2.2524797114517585, "grad_norm": 0.7331293225288391, "learning_rate": 4.839622309182789e-06, "loss": 0.0959, "step": 1250},
+    {"epoch": 2.2614968440036067, "grad_norm": 0.9344555735588074, "learning_rate": 4.730489809013962e-06, "loss": 0.0906, "step": 1255},
+    {"epoch": 2.2705139765554554, "grad_norm": 0.9602235555648804, "learning_rate": 4.622371038548217e-06, "loss": 0.0948, "step": 1260},
+    {"epoch": 2.279531109107304, "grad_norm": 0.8398227691650391, "learning_rate": 4.515276670466819e-06, "loss": 0.0887, "step": 1265},
+    {"epoch": 2.2885482416591523, "grad_norm": 1.054457426071167, "learning_rate": 4.409217276329666e-06, "loss": 0.1024, "step": 1270},
+    {"epoch": 2.297565374211001, "grad_norm": 1.0311795473098755, "learning_rate": 4.304203325531697e-06, "loss": 0.0969, "step": 1275},
+    {"epoch": 2.3065825067628496, "grad_norm": 0.7730277180671692, "learning_rate": 4.20024518426947e-06, "loss": 0.0902, "step": 1280},
1804
+ {
1805
+ "epoch": 2.315599639314698,
1806
+ "grad_norm": 0.8550175428390503,
1807
+ "learning_rate": 4.097353114517865e-06,
1808
+ "loss": 0.1017,
1809
+ "step": 1285
1810
+ },
1811
+ {
1812
+ "epoch": 2.3246167718665465,
1813
+ "grad_norm": 0.866615355014801,
1814
+ "learning_rate": 3.995537273017124e-06,
1815
+ "loss": 0.0802,
1816
+ "step": 1290
1817
+ },
1818
+ {
1819
+ "epoch": 2.333633904418395,
1820
+ "grad_norm": 0.9264522194862366,
1821
+ "learning_rate": 3.894807710270224e-06,
1822
+ "loss": 0.0892,
1823
+ "step": 1295
1824
+ },
1825
+ {
1826
+ "epoch": 2.3426510369702434,
1827
+ "grad_norm": 1.0422756671905518,
1828
+ "learning_rate": 3.7951743695507766e-06,
1829
+ "loss": 0.0917,
1830
+ "step": 1300
1831
+ },
1832
+ {
1833
+ "epoch": 2.351668169522092,
1834
+ "grad_norm": 1.167747974395752,
1835
+ "learning_rate": 3.6966470859215116e-06,
1836
+ "loss": 0.0817,
1837
+ "step": 1305
1838
+ },
1839
+ {
1840
+ "epoch": 2.3606853020739407,
1841
+ "grad_norm": 0.7370645403862,
1842
+ "learning_rate": 3.599235585263424e-06,
1843
+ "loss": 0.0896,
1844
+ "step": 1310
1845
+ },
1846
+ {
1847
+ "epoch": 2.369702434625789,
1848
+ "grad_norm": 0.8926695585250854,
1849
+ "learning_rate": 3.502949483315705e-06,
1850
+ "loss": 0.0817,
1851
+ "step": 1315
1852
+ },
1853
+ {
1854
+ "epoch": 2.3787195671776376,
1855
+ "grad_norm": 1.043579339981079,
1856
+ "learning_rate": 3.4077982847265547e-06,
1857
+ "loss": 0.0901,
1858
+ "step": 1320
1859
+ },
1860
+ {
1861
+ "epoch": 2.387736699729486,
1862
+ "grad_norm": 0.8871545195579529,
1863
+ "learning_rate": 3.313791382114943e-06,
1864
+ "loss": 0.0794,
1865
+ "step": 1325
1866
+ },
1867
+ {
1868
+ "epoch": 2.3967538322813344,
1869
+ "grad_norm": 0.9291399717330933,
1870
+ "learning_rate": 3.220938055143464e-06,
1871
+ "loss": 0.083,
1872
+ "step": 1330
1873
+ },
1874
+ {
1875
+ "epoch": 2.405770964833183,
1876
+ "grad_norm": 0.8532813787460327,
1877
+ "learning_rate": 3.129247469602284e-06,
1878
+ "loss": 0.0838,
1879
+ "step": 1335
1880
+ },
1881
+ {
1882
+ "epoch": 2.4147880973850318,
1883
+ "grad_norm": 0.7937539219856262,
1884
+ "learning_rate": 3.038728676504384e-06,
1885
+ "loss": 0.0888,
1886
+ "step": 1340
1887
+ },
1888
+ {
1889
+ "epoch": 2.42380522993688,
1890
+ "grad_norm": 0.8024986982345581,
1891
+ "learning_rate": 2.9493906111921067e-06,
1892
+ "loss": 0.0925,
1893
+ "step": 1345
1894
+ },
1895
+ {
1896
+ "epoch": 2.4328223624887286,
1897
+ "grad_norm": 1.108996033668518,
1898
+ "learning_rate": 2.8612420924551307e-06,
1899
+ "loss": 0.0941,
1900
+ "step": 1350
1901
+ },
1902
+ {
1903
+ "epoch": 2.4418394950405773,
1904
+ "grad_norm": 1.0708988904953003,
1905
+ "learning_rate": 2.7742918216599294e-06,
1906
+ "loss": 0.0853,
1907
+ "step": 1355
1908
+ },
1909
+ {
1910
+ "epoch": 2.4508566275924255,
1911
+ "grad_norm": 0.8975257277488708,
1912
+ "learning_rate": 2.688548381890859e-06,
1913
+ "loss": 0.0897,
1914
+ "step": 1360
1915
+ },
1916
+ {
1917
+ "epoch": 2.459873760144274,
1918
+ "grad_norm": 0.7721099853515625,
1919
+ "learning_rate": 2.6040202371028683e-06,
1920
+ "loss": 0.0887,
1921
+ "step": 1365
1922
+ },
1923
+ {
1924
+ "epoch": 2.4688908926961224,
1925
+ "grad_norm": 0.9484956860542297,
1926
+ "learning_rate": 2.520715731286046e-06,
1927
+ "loss": 0.088,
1928
+ "step": 1370
1929
+ },
1930
+ {
1931
+ "epoch": 2.477908025247971,
1932
+ "grad_norm": 0.8486933708190918,
1933
+ "learning_rate": 2.438643087641918e-06,
1934
+ "loss": 0.0981,
1935
+ "step": 1375
1936
+ },
1937
+ {
1938
+ "epoch": 2.4869251577998197,
1939
+ "grad_norm": 0.8852415680885315,
1940
+ "learning_rate": 2.357810407771745e-06,
1941
+ "loss": 0.075,
1942
+ "step": 1380
1943
+ },
1944
+ {
1945
+ "epoch": 2.4959422903516684,
1946
+ "grad_norm": 0.8159101009368896,
1947
+ "learning_rate": 2.278225670876773e-06,
1948
+ "loss": 0.0789,
1949
+ "step": 1385
1950
+ },
1951
+ {
1952
+ "epoch": 2.5049594229035166,
1953
+ "grad_norm": 0.9947654008865356,
1954
+ "learning_rate": 2.199896732970608e-06,
1955
+ "loss": 0.0975,
1956
+ "step": 1390
1957
+ },
1958
+ {
1959
+ "epoch": 2.5139765554553652,
1960
+ "grad_norm": 0.7149386405944824,
1961
+ "learning_rate": 2.1228313261037024e-06,
1962
+ "loss": 0.0868,
1963
+ "step": 1395
1964
+ },
1965
+ {
1966
+ "epoch": 2.5229936880072135,
1967
+ "grad_norm": 0.7576277852058411,
1968
+ "learning_rate": 2.0470370576001213e-06,
1969
+ "loss": 0.0793,
1970
+ "step": 1400
1971
+ },
1972
+ {
1973
+ "epoch": 2.532010820559062,
1974
+ "grad_norm": 0.6867720484733582,
1975
+ "learning_rate": 1.9725214093066003e-06,
1976
+ "loss": 0.0806,
1977
+ "step": 1405
1978
+ },
1979
+ {
1980
+ "epoch": 2.541027953110911,
1981
+ "grad_norm": 0.8887751698493958,
1982
+ "learning_rate": 1.8992917368539969e-06,
1983
+ "loss": 0.0842,
1984
+ "step": 1410
1985
+ },
1986
+ {
1987
+ "epoch": 2.5500450856627594,
1988
+ "grad_norm": 0.8664789199829102,
1989
+ "learning_rate": 1.827355268931194e-06,
1990
+ "loss": 0.076,
1991
+ "step": 1415
1992
+ },
1993
+ {
1994
+ "epoch": 2.5590622182146077,
1995
+ "grad_norm": 1.0679939985275269,
1996
+ "learning_rate": 1.7567191065715244e-06,
1997
+ "loss": 0.0792,
1998
+ "step": 1420
1999
+ },
2000
+ {
2001
+ "epoch": 2.5680793507664563,
2002
+ "grad_norm": 0.7477567791938782,
2003
+ "learning_rate": 1.6873902224518307e-06,
2004
+ "loss": 0.0841,
2005
+ "step": 1425
2006
+ },
2007
+ {
2008
+ "epoch": 2.5770964833183045,
2009
+ "grad_norm": 0.7139832973480225,
2010
+ "learning_rate": 1.6193754602041722e-06,
2011
+ "loss": 0.0763,
2012
+ "step": 1430
2013
+ },
2014
+ {
2015
+ "epoch": 2.586113615870153,
2016
+ "grad_norm": 0.7922085523605347,
2017
+ "learning_rate": 1.552681533740259e-06,
2018
+ "loss": 0.0802,
2019
+ "step": 1435
2020
+ },
2021
+ {
2022
+ "epoch": 2.595130748422002,
2023
+ "grad_norm": 0.7461308240890503,
2024
+ "learning_rate": 1.4873150265887114e-06,
2025
+ "loss": 0.0755,
2026
+ "step": 1440
2027
+ },
2028
+ {
2029
+ "epoch": 2.6041478809738505,
2030
+ "grad_norm": 0.8084311485290527,
2031
+ "learning_rate": 1.4232823912451808e-06,
2032
+ "loss": 0.0904,
2033
+ "step": 1445
2034
+ },
2035
+ {
2036
+ "epoch": 2.6131650135256987,
2037
+ "grad_norm": 0.7800841927528381,
2038
+ "learning_rate": 1.36058994853542e-06,
2039
+ "loss": 0.0864,
2040
+ "step": 1450
2041
+ },
2042
+ {
2043
+ "epoch": 2.6221821460775474,
2044
+ "grad_norm": 0.7656964659690857,
2045
+ "learning_rate": 1.2992438869913192e-06,
2046
+ "loss": 0.0837,
2047
+ "step": 1455
2048
+ },
2049
+ {
2050
+ "epoch": 2.6311992786293956,
2051
+ "grad_norm": 0.7480539679527283,
2052
+ "learning_rate": 1.239250262240028e-06,
2053
+ "loss": 0.0766,
2054
+ "step": 1460
2055
+ },
2056
+ {
2057
+ "epoch": 2.6402164111812443,
2058
+ "grad_norm": 0.99371337890625,
2059
+ "learning_rate": 1.1806149964061925e-06,
2060
+ "loss": 0.074,
2061
+ "step": 1465
2062
+ },
2063
+ {
2064
+ "epoch": 2.649233543733093,
2065
+ "grad_norm": 0.9161285161972046,
2066
+ "learning_rate": 1.1233438775273713e-06,
2067
+ "loss": 0.0723,
2068
+ "step": 1470
2069
+ },
2070
+ {
2071
+ "epoch": 2.6582506762849416,
2072
+ "grad_norm": 0.7587339282035828,
2073
+ "learning_rate": 1.0674425589826615e-06,
2074
+ "loss": 0.0851,
2075
+ "step": 1475
2076
+ },
2077
+ {
2078
+ "epoch": 2.66726780883679,
2079
+ "grad_norm": 0.7067431211471558,
2080
+ "learning_rate": 1.0129165589346644e-06,
2081
+ "loss": 0.068,
2082
+ "step": 1480
2083
+ },
2084
+ {
2085
+ "epoch": 2.6762849413886385,
2086
+ "grad_norm": 0.7943369746208191,
2087
+ "learning_rate": 9.59771259784762e-07,
2088
+ "loss": 0.0738,
2089
+ "step": 1485
2090
+ },
2091
+ {
2092
+ "epoch": 2.6853020739404867,
2093
+ "grad_norm": 0.916835606098175,
2094
+ "learning_rate": 9.080119076418092e-07,
2095
+ "loss": 0.0844,
2096
+ "step": 1490
2097
+ },
2098
+ {
2099
+ "epoch": 2.6943192064923354,
2100
+ "grad_norm": 1.5968210697174072,
2101
+ "learning_rate": 8.576436118042725e-07,
2102
+ "loss": 0.0772,
2103
+ "step": 1495
2104
+ },
2105
+ {
2106
+ "epoch": 2.703336339044184,
2107
+ "grad_norm": 0.6775498390197754,
2108
+ "learning_rate": 8.086713442558812e-07,
2109
+ "loss": 0.0839,
2110
+ "step": 1500
2111
+ },
2112
+ {
2113
+ "epoch": 2.7123534715960327,
2114
+ "grad_norm": 0.7050554156303406,
2115
+ "learning_rate": 7.610999391748302e-07,
2116
+ "loss": 0.0847,
2117
+ "step": 1505
2118
+ },
2119
+ {
2120
+ "epoch": 2.721370604147881,
2121
+ "grad_norm": 0.7579672336578369,
2122
+ "learning_rate": 7.149340924565867e-07,
2123
+ "loss": 0.0876,
2124
+ "step": 1510
2125
+ },
2126
+ {
2127
+ "epoch": 2.7303877366997296,
2128
+ "grad_norm": 0.6910444498062134,
2129
+ "learning_rate": 6.701783612503437e-07,
2130
+ "loss": 0.0692,
2131
+ "step": 1515
2132
+ },
2133
+ {
2134
+ "epoch": 2.739404869251578,
2135
+ "grad_norm": 0.7011984586715698,
2136
+ "learning_rate": 6.268371635091763e-07,
2137
+ "loss": 0.0798,
2138
+ "step": 1520
2139
+ },
2140
+ {
2141
+ "epoch": 2.7484220018034264,
2142
+ "grad_norm": 0.905360221862793,
2143
+ "learning_rate": 5.849147775539254e-07,
2144
+ "loss": 0.0881,
2145
+ "step": 1525
2146
+ },
2147
+ {
2148
+ "epoch": 2.757439134355275,
2149
+ "grad_norm": 0.8783424496650696,
2150
+ "learning_rate": 5.444153416508873e-07,
2151
+ "loss": 0.1002,
2152
+ "step": 1530
2153
+ },
2154
+ {
2155
+ "epoch": 2.7664562669071238,
2156
+ "grad_norm": 0.7183735370635986,
2157
+ "learning_rate": 5.05342853603304e-07,
2158
+ "loss": 0.085,
2159
+ "step": 1535
2160
+ },
2161
+ {
2162
+ "epoch": 2.775473399458972,
2163
+ "grad_norm": 0.6247619390487671,
2164
+ "learning_rate": 4.6770117035672967e-07,
2165
+ "loss": 0.0791,
2166
+ "step": 1540
2167
+ },
2168
+ {
2169
+ "epoch": 2.7844905320108206,
2170
+ "grad_norm": 1.186852216720581,
2171
+ "learning_rate": 4.3149400761830825e-07,
2172
+ "loss": 0.0785,
2173
+ "step": 1545
2174
+ },
2175
+ {
2176
+ "epoch": 2.793507664562669,
2177
+ "grad_norm": 0.7776186466217041,
2178
+ "learning_rate": 3.9672493948998743e-07,
2179
+ "loss": 0.0837,
2180
+ "step": 1550
2181
+ },
2182
+ {
2183
+ "epoch": 2.8025247971145175,
2184
+ "grad_norm": 0.6376893520355225,
2185
+ "learning_rate": 3.633973981156996e-07,
2186
+ "loss": 0.0746,
2187
+ "step": 1555
2188
+ },
2189
+ {
2190
+ "epoch": 2.811541929666366,
2191
+ "grad_norm": 0.8335205316543579,
2192
+ "learning_rate": 3.315146733425728e-07,
2193
+ "loss": 0.0851,
2194
+ "step": 1560
2195
+ },
2196
+ {
2197
+ "epoch": 2.820559062218215,
2198
+ "grad_norm": 0.8827762007713318,
2199
+ "learning_rate": 3.010799123961805e-07,
2200
+ "loss": 0.0787,
2201
+ "step": 1565
2202
+ },
2203
+ {
2204
+ "epoch": 2.829576194770063,
2205
+ "grad_norm": 0.8129012584686279,
2206
+ "learning_rate": 2.7209611956987843e-07,
2207
+ "loss": 0.0774,
2208
+ "step": 1570
2209
+ },
2210
+ {
2211
+ "epoch": 2.8385933273219117,
2212
+ "grad_norm": 0.7518905401229858,
2213
+ "learning_rate": 2.445661559282286e-07,
2214
+ "loss": 0.0711,
2215
+ "step": 1575
2216
+ },
2217
+ {
2218
+ "epoch": 2.84761045987376,
2219
+ "grad_norm": 0.6657513380050659,
2220
+ "learning_rate": 2.1849273902458167e-07,
2221
+ "loss": 0.0734,
2222
+ "step": 1580
2223
+ },
2224
+ {
2225
+ "epoch": 2.8566275924256086,
2226
+ "grad_norm": 0.7317426204681396,
2227
+ "learning_rate": 1.9387844263282993e-07,
2228
+ "loss": 0.0698,
2229
+ "step": 1585
2230
+ },
2231
+ {
2232
+ "epoch": 2.8656447249774573,
2233
+ "grad_norm": 0.7263646125793457,
2234
+ "learning_rate": 1.707256964933307e-07,
2235
+ "loss": 0.0663,
2236
+ "step": 1590
2237
+ },
2238
+ {
2239
+ "epoch": 2.874661857529306,
2240
+ "grad_norm": 0.7074626088142395,
2241
+ "learning_rate": 1.4903678607306626e-07,
2242
+ "loss": 0.0805,
2243
+ "step": 1595
2244
+ },
2245
+ {
2246
+ "epoch": 2.883678990081154,
2247
+ "grad_norm": 0.8161285519599915,
2248
+ "learning_rate": 1.288138523400395e-07,
2249
+ "loss": 0.0731,
2250
+ "step": 1600
2251
+ },
2252
+ {
2253
+ "epoch": 2.892696122633003,
2254
+ "grad_norm": 0.7892455458641052,
2255
+ "learning_rate": 1.1005889155193294e-07,
2256
+ "loss": 0.083,
2257
+ "step": 1605
2258
+ },
2259
+ {
2260
+ "epoch": 2.901713255184851,
2261
+ "grad_norm": 0.7685972452163696,
2262
+ "learning_rate": 9.277375505905128e-08,
2263
+ "loss": 0.0727,
2264
+ "step": 1610
2265
+ },
2266
+ {
2267
+ "epoch": 2.9107303877366997,
2268
+ "grad_norm": 0.7566655874252319,
2269
+ "learning_rate": 7.696014912157267e-08,
2270
+ "loss": 0.0782,
2271
+ "step": 1615
2272
+ },
2273
+ {
2274
+ "epoch": 2.9197475202885483,
2275
+ "grad_norm": 0.7331447601318359,
2276
+ "learning_rate": 6.261963474111842e-08,
2277
+ "loss": 0.071,
2278
+ "step": 1620
2279
+ },
2280
+ {
2281
+ "epoch": 2.928764652840397,
2282
+ "grad_norm": 0.7547169327735901,
2283
+ "learning_rate": 4.9753627506659574e-08,
2284
+ "loss": 0.0814,
2285
+ "step": 1625
2286
+ },
2287
+ {
2288
+ "epoch": 2.937781785392245,
2289
+ "grad_norm": 0.7907186150550842,
2290
+ "learning_rate": 3.8363397454788674e-08,
2291
+ "loss": 0.0807,
2292
+ "step": 1630
2293
+ },
2294
+ {
2295
+ "epoch": 2.946798917944094,
2296
+ "grad_norm": 0.7324758768081665,
2297
+ "learning_rate": 2.8450068944338436e-08,
2298
+ "loss": 0.0817,
2299
+ "step": 1635
2300
+ },
2301
+ {
2302
+ "epoch": 2.955816050495942,
2303
+ "grad_norm": 0.7154582142829895,
2304
+ "learning_rate": 2.0014620545407147e-08,
2305
+ "loss": 0.0811,
2306
+ "step": 1640
2307
+ },
2308
+ {
2309
+ "epoch": 2.9648331830477908,
2310
+ "grad_norm": 0.586340069770813,
2311
+ "learning_rate": 1.3057884942749288e-08,
2312
+ "loss": 0.0705,
2313
+ "step": 1645
2314
+ },
2315
+ {
2316
+ "epoch": 2.9738503155996394,
2317
+ "grad_norm": 0.7645546793937683,
2318
+ "learning_rate": 7.580548853589608e-09,
2319
+ "loss": 0.0815,
2320
+ "step": 1650
2321
+ },
2322
+ {
2323
+ "epoch": 2.982867448151488,
2324
+ "grad_norm": 0.7248897552490234,
2325
+ "learning_rate": 3.583152959829028e-09,
2326
+ "loss": 0.0677,
2327
+ "step": 1655
2328
+ },
2329
+ {
2330
+ "epoch": 2.9918845807033363,
2331
+ "grad_norm": 0.6330137848854065,
2332
+ "learning_rate": 1.0660918546723243e-09,
2333
+ "loss": 0.0691,
2334
+ "step": 1660
2335
+ },
2336
+ {
2337
+ "epoch": 3.0,
2338
+ "grad_norm": 1.005300760269165,
2339
+ "learning_rate": 2.961400367929024e-11,
2340
+ "loss": 0.0844,
2341
+ "step": 1665
2342
+ }
2343
+ ],
2344
+ "logging_steps": 5,
2345
+ "max_steps": 1665,
2346
+ "num_input_tokens_seen": 0,
2347
+ "num_train_epochs": 3,
2348
+ "save_steps": 2000,
2349
+ "stateful_callbacks": {
2350
+ "TrainerControl": {
2351
+ "args": {
2352
+ "should_epoch_stop": false,
2353
+ "should_evaluate": false,
2354
+ "should_log": false,
2355
+ "should_save": true,
2356
+ "should_training_stop": true
2357
+ },
2358
+ "attributes": {}
2359
+ }
2360
+ },
2361
+ "total_flos": 2.1483659137394934e+18,
2362
+ "train_batch_size": 2,
2363
+ "trial_name": null,
2364
+ "trial_params": null
2365
+ }
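The trainer state above ends cleanly: training stopped at step 1665 after 3 full epochs, with should_training_stop set. As a minimal sketch (assuming this is the standard trainer_state.json written by the Hugging Face Trainer, with the per-step entries stored under log_history, and that the file sits in the folder used throughout this upload), the logged loss curve can be pulled out like this:

import json

# Sketch: extract the logged loss curve from the trainer state shown above.
# The path is an assumption based on the folder name used in this upload.
with open("141_128_e3_3e-5/trainer_state.json") as f:
    state = json.load(f)

points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
print(f"{len(points)} logged points; final loss {points[-1][1]} at step {points[-1][0]}")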
141_128_e3_3e-5/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2b3dd1f7eee865d714cb80e402a814f5c846292ea188a6d8dd8bd34993376d5
3
+ size 8209
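Like the other binary files in this upload, training_args.bin is stored as a Git LFS pointer (the version/oid/size stanza above), not the 8209-byte binary itself. A small sketch for telling the two apart before attempting to deserialize the file; the path and the detection heuristic are assumptions:

import torch

def looks_like_lfs_pointer(path):
    # Heuristic: LFS pointer files begin with the spec line shown above.
    with open(path, "rb") as f:
        return f.read(7) == b"version"

path = "141_128_e3_3e-5/training_args.bin"  # assumed local path
if looks_like_lfs_pointer(path):
    print("Pointer only; run `git lfs pull` to fetch the real file first.")
else:
    # Deserializing typically needs transformers installed, since the file
    # pickles a TrainingArguments object.
    print(torch.load(path, weights_only=False))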
141_128_e3_3e-5/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
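Although vocab.json is too large to render inline, it is loadable together with the rest of the tokenizer files in this folder. A minimal sketch, assuming the usual Hugging Face tokenizer companion files are present alongside it:

from transformers import AutoTokenizer

# Sketch: load the tokenizer straight from the uploaded checkpoint folder.
tok = AutoTokenizer.from_pretrained("141_128_e3_3e-5")
print(tok("hello world").input_ids)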
141_128_e3_3e-5/zero_to_fp32.py ADDED
@@ -0,0 +1,604 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
+
15
+ import argparse
16
+ import torch
17
+ import glob
18
+ import math
19
+ import os
20
+ import re
21
+ from collections import OrderedDict
22
+ from dataclasses import dataclass
23
+
24
+ # While this script doesn't use deepspeed to recover data, the checkpoints are pickled with
25
+ # DeepSpeed data structures, so deepspeed has to be available in the current python environment.
26
+ from deepspeed.utils import logger
27
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
+
31
+
32
+ @dataclass
33
+ class zero_model_state:
34
+ buffers: dict()
35
+ param_shapes: dict()
36
+ shared_params: list
37
+ ds_version: int
38
+ frozen_param_shapes: dict()
39
+ frozen_param_fragments: dict()
40
+
41
+
42
+ debug = 0
43
+
44
+ # load to cpu
45
+ device = torch.device('cpu')
46
+
47
+
48
+ def atoi(text):
49
+ return int(text) if text.isdigit() else text
50
+
51
+
52
+ def natural_keys(text):
53
+ '''
54
+ alist.sort(key=natural_keys) sorts in human order
55
+ http://nedbatchelder.com/blog/200712/human_sorting.html
56
+ (See Toothy's implementation in the comments)
57
+ '''
58
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
59
+
60
+
61
+ def get_model_state_file(checkpoint_dir, zero_stage):
62
+ if not os.path.isdir(checkpoint_dir):
63
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
64
+
65
+ # there should be only one file
66
+ if zero_stage <= 2:
67
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
68
+ elif zero_stage == 3:
69
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
70
+
71
+ if not os.path.exists(file):
72
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
73
+
74
+ return file
75
+
76
+
77
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
78
+ # XXX: need to test that this simple glob rule works for multi-node setup too
79
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
80
+
81
+ if len(ckpt_files) == 0:
82
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
83
+
84
+ return ckpt_files
85
+
86
+
87
+ def get_optim_files(checkpoint_dir):
88
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
89
+
90
+
91
+ def get_model_state_files(checkpoint_dir):
92
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
93
+
94
+
95
+ def parse_model_states(files):
96
+ zero_model_states = []
97
+ for file in files:
98
+ state_dict = torch.load(file, map_location=device)
99
+
100
+ if BUFFER_NAMES not in state_dict:
101
+ raise ValueError(f"{file} is not a model state checkpoint")
102
+ buffer_names = state_dict[BUFFER_NAMES]
103
+ if debug:
104
+ print("Found buffers:", buffer_names)
105
+
106
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
107
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
108
+ param_shapes = state_dict[PARAM_SHAPES]
109
+
110
+ # collect parameters that are included in param_shapes
111
+ param_names = []
112
+ for s in param_shapes:
113
+ for name in s.keys():
114
+ param_names.append(name)
115
+
116
+ # update with frozen parameters
117
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
118
+ if frozen_param_shapes is not None:
119
+ if debug:
120
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
121
+ param_names += list(frozen_param_shapes.keys())
122
+
123
+ # handle shared params
124
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
125
+
126
+ ds_version = state_dict.get(DS_VERSION, None)
127
+
128
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
129
+
130
+ z_model_state = zero_model_state(buffers=buffers,
131
+ param_shapes=param_shapes,
132
+ shared_params=shared_params,
133
+ ds_version=ds_version,
134
+ frozen_param_shapes=frozen_param_shapes,
135
+ frozen_param_fragments=frozen_param_fragments)
136
+ zero_model_states.append(z_model_state)
137
+
138
+ return zero_model_states
139
+
140
+
141
+ def parse_optim_states(files, ds_checkpoint_dir):
142
+
143
+ total_files = len(files)
144
+ state_dicts = []
145
+ for f in files:
146
+ state_dict = torch.load(f, map_location=device)
147
+ # immediately discard the two potentially huge optimizer states, as we only care about the fp32 master weights
148
+ # and also handle the case where it was already removed by another helper script
149
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
150
+ state_dicts.append(state_dict)
151
+
152
+ if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
153
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
154
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
155
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
156
+
157
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
158
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
159
+ # use the max of the partition_count to get the dp world_size.
160
+
161
+ if type(world_size) is list:
162
+ world_size = max(world_size)
163
+
164
+ if world_size != total_files:
165
+ raise ValueError(
166
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
167
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
168
+ )
169
+
170
+ # the groups are named differently in each stage
171
+ if zero_stage <= 2:
172
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
173
+ elif zero_stage == 3:
174
+ fp32_groups_key = FP32_FLAT_GROUPS
175
+ else:
176
+ raise ValueError(f"unknown zero stage {zero_stage}")
177
+
178
+ if zero_stage <= 2:
179
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
180
+ elif zero_stage == 3:
181
+ # if there is more than one param group, there will be multiple flattened tensors - one
182
+ # flattened tensor per group - for simplicity merge them into a single tensor
183
+ #
184
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
185
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
186
+
187
+ fp32_flat_groups = [
188
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
189
+ ]
190
+
191
+ return zero_stage, world_size, fp32_flat_groups
192
+
193
+
194
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
195
+ """
196
+ Returns fp32 state_dict reconstructed from ds checkpoint
197
+
198
+ Args:
199
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
200
+
201
+ """
202
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
203
+
204
+ optim_files = get_optim_files(ds_checkpoint_dir)
205
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
206
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
207
+
208
+ model_files = get_model_state_files(ds_checkpoint_dir)
209
+
210
+ zero_model_states = parse_model_states(model_files)
211
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
212
+
213
+ if zero_stage <= 2:
214
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
215
+ exclude_frozen_parameters)
216
+ elif zero_stage == 3:
217
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
218
+ exclude_frozen_parameters)
219
+
220
+
221
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
222
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
223
+ return
224
+
225
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
226
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
227
+
228
+ if debug:
229
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
230
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
231
+
232
+ wanted_params = len(frozen_param_shapes)
233
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
234
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
235
+ print(f'Frozen params: Have {avail_numel} numels to process.')
236
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
237
+
238
+ total_params = 0
239
+ total_numel = 0
240
+ for name, shape in frozen_param_shapes.items():
241
+ total_params += 1
242
+ unpartitioned_numel = shape.numel()
243
+ total_numel += unpartitioned_numel
244
+
245
+ state_dict[name] = frozen_param_fragments[name]
246
+
247
+ if debug:
248
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
249
+
250
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
251
+
252
+
253
+ def _has_callable(obj, fn):
254
+ attr = getattr(obj, fn, None)
255
+ return callable(attr)
256
+
257
+
258
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
259
+ param_shapes = zero_model_states[0].param_shapes
260
+
261
+ # Reconstruction protocol:
262
+ #
263
+ # XXX: document this
264
+
265
+ if debug:
266
+ for i in range(world_size):
267
+ for j in range(len(fp32_flat_groups[0])):
268
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
269
+
270
+ # XXX: memory usage doubles here (zero2)
271
+ num_param_groups = len(fp32_flat_groups[0])
272
+ merged_single_partition_of_fp32_groups = []
273
+ for i in range(num_param_groups):
274
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
275
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
276
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
277
+ avail_numel = sum(
278
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
279
+
280
+ if debug:
281
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
282
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
283
+ # not asserting if there is a mismatch due to possible padding
284
+ print(f"Have {avail_numel} numels to process.")
285
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
286
+
287
+ # params
288
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
289
+ # out-of-core computing solution
290
+ total_numel = 0
291
+ total_params = 0
292
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
293
+ offset = 0
294
+ avail_numel = full_single_fp32_vector.numel()
295
+ for name, shape in shapes.items():
296
+
297
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
298
+ total_numel += unpartitioned_numel
299
+ total_params += 1
300
+
301
+ if debug:
302
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
303
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
304
+ offset += unpartitioned_numel
305
+
306
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
307
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
308
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
309
+ # live optimizer object, so we are checking that the numbers are within the right range
310
+ align_to = 2 * world_size
311
+
312
+ def zero2_align(x):
313
+ return align_to * math.ceil(x / align_to)
314
+
315
+ if debug:
316
+ print(f"original offset={offset}, avail_numel={avail_numel}")
317
+
318
+ offset = zero2_align(offset)
319
+ avail_numel = zero2_align(avail_numel)
320
+
321
+ if debug:
322
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
323
+
324
+ # Sanity check
325
+ if offset != avail_numel:
326
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
327
+
328
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
329
+
330
+
331
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
332
+ exclude_frozen_parameters):
333
+ state_dict = OrderedDict()
334
+
335
+ # buffers
336
+ buffers = zero_model_states[0].buffers
337
+ state_dict.update(buffers)
338
+ if debug:
339
+ print(f"added {len(buffers)} buffers")
340
+
341
+ if not exclude_frozen_parameters:
342
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
343
+
344
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
345
+
346
+ # recover shared parameters
347
+ for pair in zero_model_states[0].shared_params:
348
+ if pair[1] in state_dict:
349
+ state_dict[pair[0]] = state_dict[pair[1]]
350
+
351
+ return state_dict
352
+
353
+
354
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
355
+ remainder = unpartitioned_numel % world_size
356
+ padding_numel = (world_size - remainder) if remainder else 0
357
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
358
+ return partitioned_numel, padding_numel
359
+
360
+
361
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
362
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
363
+ return
364
+
365
+ if debug:
366
+ for i in range(world_size):
367
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
368
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
369
+
370
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
371
+ wanted_params = len(frozen_param_shapes)
372
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
373
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
374
+ print(f'Frozen params: Have {avail_numel} numels to process.')
375
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
376
+
377
+ total_params = 0
378
+ total_numel = 0
379
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
380
+ total_params += 1
381
+ unpartitioned_numel = shape.numel()
382
+ total_numel += unpartitioned_numel
383
+
384
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
385
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
386
+
387
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
388
+
389
+ if debug:
390
+ print(
391
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
392
+ )
393
+
394
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
395
+
396
+
397
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
398
+ param_shapes = zero_model_states[0].param_shapes
399
+ avail_numel = fp32_flat_groups[0].numel() * world_size
400
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
401
+ # param, re-consolidating each param, while dealing with padding if any
402
+
403
+ # merge list of dicts, preserving order
404
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
405
+
406
+ if debug:
407
+ for i in range(world_size):
408
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
409
+
410
+ wanted_params = len(param_shapes)
411
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
412
+ # not asserting if there is a mismatch due to possible padding
413
+ avail_numel = fp32_flat_groups[0].numel() * world_size
414
+ print(f"Trainable params: Have {avail_numel} numels to process.")
415
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
416
+
417
+ # params
418
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
419
+ # out-of-core computing solution
420
+ offset = 0
421
+ total_numel = 0
422
+ total_params = 0
423
+ for name, shape in param_shapes.items():
424
+
425
+ unpartitioned_numel = shape.numel()
426
+ total_numel += unpartitioned_numel
427
+ total_params += 1
428
+
429
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
430
+
431
+ if debug:
432
+ print(
433
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
434
+ )
435
+
436
+ # XXX: memory usage doubles here
437
+ state_dict[name] = torch.cat(
438
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
439
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
440
+ offset += partitioned_numel
441
+
442
+ offset *= world_size
443
+
444
+ # Sanity check
445
+ if offset != avail_numel:
446
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
447
+
448
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
449
+
450
+
451
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
452
+ exclude_frozen_parameters):
453
+ state_dict = OrderedDict()
454
+
455
+ # buffers
456
+ buffers = zero_model_states[0].buffers
457
+ state_dict.update(buffers)
458
+ if debug:
459
+ print(f"added {len(buffers)} buffers")
460
+
461
+ if not exclude_frozen_parameters:
462
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
463
+
464
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
465
+
466
+ # recover shared parameters
467
+ for pair in zero_model_states[0].shared_params:
468
+ if pair[1] in state_dict:
469
+ state_dict[pair[0]] = state_dict[pair[1]]
470
+
471
+ return state_dict
472
+
473
+
474
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
475
+ """
476
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
477
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
478
+ via a model hub.
479
+
480
+ Args:
481
+ - ``checkpoint_dir``: path to the desired checkpoint folder
482
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
483
+ - ``exclude_frozen_parameters``: exclude frozen parameters
484
+
485
+ Returns:
486
+ - pytorch ``state_dict``
487
+
488
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
489
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
490
+ the checkpoint.
491
+
492
+ A typical usage might be ::
493
+
494
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
495
+ # do the training and checkpoint saving
496
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
497
+ model = model.cpu() # move to cpu
498
+ model.load_state_dict(state_dict)
499
+ # submit to model hub or save the model to share with others
500
+
501
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
502
+ application. i.e. you will need to re-initialize the deepspeed engine, since
503
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
504
+
505
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
506
+
507
+ """
508
+ if tag is None:
509
+ latest_path = os.path.join(checkpoint_dir, 'latest')
510
+ if os.path.isfile(latest_path):
511
+ with open(latest_path, 'r') as fd:
512
+ tag = fd.read().strip()
513
+ else:
514
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
515
+
516
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
517
+
518
+ if not os.path.isdir(ds_checkpoint_dir):
519
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
520
+
521
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
522
+
523
+
524
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
525
+ """
526
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
527
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
528
+
529
+ Args:
530
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
531
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
532
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
533
+ - ``exclude_frozen_parameters``: exclude frozen parameters
534
+ """
535
+
536
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
537
+ print(f"Saving fp32 state dict to {output_file}")
538
+ torch.save(state_dict, output_file)
539
+
540
+
541
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
542
+ """
543
+ 1. Put the provided model to cpu
544
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
545
+ 3. Load it into the provided model
546
+
547
+ Args:
548
+ - ``model``: the model object to update
549
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
550
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
551
+
552
+ Returns:
553
+ - ``model``: modified model
554
+
555
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
556
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
557
+ conveniently placed for you in the checkpoint folder.
558
+
559
+ A typical usage might be ::
560
+
561
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
562
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
563
+ # submit to model hub or save the model to share with others
564
+
565
+ Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
566
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
567
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
568
+
569
+ """
570
+ logger.info("Extracting fp32 weights")
571
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
572
+
573
+ logger.info("Overwriting model with fp32 weights")
574
+ model = model.cpu()
575
+ model.load_state_dict(state_dict, strict=False)
576
+
577
+ return model
578
+
579
+
580
+ if __name__ == "__main__":
581
+
582
+ parser = argparse.ArgumentParser()
583
+ parser.add_argument("checkpoint_dir",
584
+ type=str,
585
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
586
+ parser.add_argument(
587
+ "output_file",
588
+ type=str,
589
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
590
+ parser.add_argument("-t",
591
+ "--tag",
592
+ type=str,
593
+ default=None,
594
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
595
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
596
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
597
+ args = parser.parse_args()
598
+
599
+ debug = args.debug
600
+
601
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
602
+ args.output_file,
603
+ tag=args.tag,
604
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
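A minimal end-to-end sketch for this particular checkpoint; the output filename is an assumption, and when the tag is omitted it is read from the 'latest' file, as the script's own logic above shows:

# Shell form, run from inside the checkpoint folder as the header comment suggests:
#   python zero_to_fp32.py . pytorch_model.bin
# Or programmatically, using the function defined above:
from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict(
    "141_128_e3_3e-5",         # checkpoint dir containing the tag folder
    "pytorch_model_fp32.bin",  # assumed output path
)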