S2R-data committed on Aug 8, 2025

Commit

c54f8e4

verified ·

1 Parent(s): b865c7e

Upload folder using huggingface_hub

Browse files

Files changed (48) hide show

added_tokens.json +6 -0
config.json +30 -0
generation_config.json +14 -0
latest +1 -0
merges.txt +0 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +346 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
rng_state_10.pth +3 -0
rng_state_11.pth +3 -0
rng_state_12.pth +3 -0
rng_state_13.pth +3 -0
rng_state_14.pth +3 -0
rng_state_15.pth +3 -0
rng_state_16.pth +3 -0
rng_state_17.pth +3 -0
rng_state_18.pth +3 -0
rng_state_19.pth +3 -0
rng_state_2.pth +3 -0
rng_state_20.pth +3 -0
rng_state_21.pth +3 -0
rng_state_22.pth +3 -0
rng_state_23.pth +3 -0
rng_state_24.pth +3 -0
rng_state_25.pth +3 -0
rng_state_26.pth +3 -0
rng_state_27.pth +3 -0
rng_state_28.pth +3 -0
rng_state_29.pth +3 -0
rng_state_3.pth +3 -0
rng_state_30.pth +3 -0
rng_state_31.pth +3 -0
rng_state_4.pth +3 -0
rng_state_5.pth +3 -0
rng_state_6.pth +3 -0
rng_state_7.pth +3 -0
rng_state_8.pth +3 -0
rng_state_9.pth +3 -0
special_tokens_map.json +28 -0
tokenizer_config.json +69 -0
trainer_state.json +1400 -0
training_args.bin +3 -0
vocab.json +0 -0
zero_to_fp32.py +587 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "<pad>": 151646,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_name_or_path": "/apdcephfs/share_300000800/user/ruotianma/peisongwang/SFT_models/qwen2-4o-0209",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 128245,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "pad_token_id": 151646,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.39.3",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151647
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.05,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.39.3"
+}

latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step164

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:65bfa1f9e904465fdd1e5c9b0bf70136100d514556350d55bfad0bb022458d78
+size 4874671720

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:713d289dcdd533a27e46e6653edf3fbcd2030571e587ee29c499bd33ff4fedb7
+size 4932751008

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:addc259caea267234dc69577aa164acecdda4aba568802728391a392eeeed974
+size 4330865200

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2591d4f3abf18e2f94ec5fec8813762f3abce2364a4be7d2e163e732ebe6adf3
+size 1087005824

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,346 @@

+{
+  "metadata": {
+    "total_size": 15225254912
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00003-of-00004.safetensors"
+  }
+}

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a59e1e640905115e048364b0cfa68fa3bfefbdca8febdef394c45c93ded68b0
+size 15984

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7df71a441235abbd1e64480694d0b56512117b852155fb93cb20227e4de559af
+size 15984

rng_state_10.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f2c5016260e380006936f2ab6d27b4d9c5cda870fd960bd22817288010ec1d44
+size 15997

rng_state_11.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bab6bac45dcf2d4bed6c495d52a31b7acd8f82fb277199761afb655ff4aaf366
+size 15997

rng_state_12.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dbddcb471041e1f9b74508c376961c10f5550323d6d162d825127239e287e304
+size 15997

rng_state_13.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d964da2ba74a6cabbdf44f9ba52b77443b21614387e635a6b8f7e43eb54fabc
+size 15997

rng_state_14.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:21bc864b019e3f203cdb7e15ba209193fe38da8981f3ce2902f9a4f691b708d1
+size 15997

rng_state_15.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:207c59f63363d35c16035a6f2302f6ae923b2b00c624316ae81b823a6e93c937
+size 15997

rng_state_16.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d445bad4678571efd4218befa25f57a57e0a089cbec06a13535be3ef9f3e17b
+size 15997

rng_state_17.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b25d9f53663d2ec44b4ab2301b4baf30f0d3976072f0fd97b9cd58959e29fa0
+size 15997

rng_state_18.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4be7a230d2f83da0680c8228d0f21ed84ced0102c0de00e2bf6fab9830e9902c
+size 15997

rng_state_19.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0173347ca42d7fd00045d9511c5703309f931d7b60680c3f8786a2afb20f5e47
+size 15997

rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd7a50311f2723e1ba9ec3204a8eefb656de1745d74c65f3ead609401c588ecd
+size 15984

rng_state_20.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e72216b3a46240b5d93d5065175d4e44f99af1e7d427cdcb81a6a2b3fff5a3d5
+size 15997

rng_state_21.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cabf21ff1c6e3f5bf6eed6baf572c0eca8cb4412c65089bb1b91c7b3df9c57fb
+size 15997

rng_state_22.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8ef8a1f665678be33b59d071c70ba8d2c1a889b864bd10c15a837ad83801555
+size 15997

rng_state_23.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc798f57e440836173fe642e29f131b96dba94da9969322ca0ba9689532c952d
+size 15997

rng_state_24.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c85c53ed56716f36bb3f677c7e4b8aa887f0b3a447b30ff02c36d6193f62d66
+size 15997

rng_state_25.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1cf70fdffa7ecc013e08ccfc50be519da0698e22c154731b1c4d2d199b28679f
+size 15997

rng_state_26.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:008454f3ba3a29e373964e53fb08536a91296bef52277ddea882f2f1cd383b7c
+size 15997

rng_state_27.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:697aff098ebb2e2d6746de57559bb7ce035e2433656e6adcaf9df59d876fb1d6
+size 15997

rng_state_28.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f0e894fcfbaf118c0bc33dcb6a272bc0c04d6ce603570eea8845fdfd84ec286
+size 15997

rng_state_29.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fdebb128cab63bee4ae96bda6ae6d15cc6af5319c9703a7b8c45330cbcebfe1
+size 15997

rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4341b0888644161212b3a485cfe5df553057f148e8e92bd94c55c24c5eb2b71d
+size 15984

rng_state_30.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:32ba025c1f8aade7d0647f594d2c7afa12e73f60e9a8acce808246ae017115fb
+size 15997

rng_state_31.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9fe5ee11a16a69e5c858ae6e11a44903fb0ffea3bc3c8e239006832716c9d02c
+size 15997

rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ab885de362f89d505131bd168dd9cd5bff3fc279de95a3b1e4869d85a210f1a
+size 15984

rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d20603e331a224643d0fa78f2846a4a30836309c80844973271f2137578e541c
+size 15984

rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74082a41fbf0209d0ed4cfd1806a2abd9f250379e4c2d31d1fbf8eabebd4047b
+size 15984

rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d3352495682b940c4595896db4b3b335daa88330267134f75896d995bb97ae19
+size 15984

rng_state_8.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87b2fb1590bb1f46c210e77f0827d1720a161692d866ed70fce7dcd7e98328db
+size 15984

rng_state_9.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8150ac7d0548e8442799f8526d2b28d69182f6a2044c51659ab4f723a317adeb
+size 15984

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,69 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "128244": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128245": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 6000,
+  "pad_token": "<pad>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": "<unk>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1400 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.2905869324473976,
+  "eval_steps": 500,
+  "global_step": 164,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "importance_ratio": 0.999987781047821,
+      "kl_div_avg": 0.00030031584901735187,
+      "learning_rate": 0.0,
+      "loss_func": "stage2",
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "importance_ratio": 1.0000834465026855,
+      "kl_div_avg": 0.0005344079108908772,
+      "learning_rate": 2.153382790366965e-07,
+      "loss_func": "stage2",
+      "step": 2
+    },
+    {
+      "epoch": 0.01,
+      "importance_ratio": 1.00002121925354,
+      "kl_div_avg": 0.0005927690071985126,
+      "learning_rate": 3.4130309724299266e-07,
+      "loss_func": "stage2",
+      "step": 3
+    },
+    {
+      "epoch": 0.01,
+      "importance_ratio": 0.9999922513961792,
+      "kl_div_avg": 0.0005759165505878627,
+      "learning_rate": 4.30676558073393e-07,
+      "loss_func": "stage2",
+      "step": 4
+    },
+    {
+      "epoch": 0.01,
+      "importance_ratio": 0.9999747276306152,
+      "kl_div_avg": 0.0006470452062785625,
+      "learning_rate": 5e-07,
+      "loss_func": "stage2",
+      "step": 5,
+      "total_loss": -0.6875
+    },
+    {
+      "epoch": 0.01,
+      "importance_ratio": 0.9998948574066162,
+      "kl_div_avg": 0.00059888995019719,
+      "learning_rate": 5e-07,
+      "loss_func": "stage2",
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "importance_ratio": 0.9999293088912964,
+      "kl_div_avg": 0.0008060205727815628,
+      "learning_rate": 4.998225062122825e-07,
+      "loss_func": "stage2",
+      "step": 7,
+      "total_loss": 0.7109375
+    },
+    {
+      "epoch": 0.01,
+      "importance_ratio": 1.0002951622009277,
+      "kl_div_avg": 0.001301910961046815,
+      "learning_rate": 4.996450124245652e-07,
+      "loss_func": "stage2",
+      "step": 8
+    },
+    {
+      "epoch": 0.02,
+      "importance_ratio": 1.0000050067901611,
+      "kl_div_avg": 0.0010975836776196957,
+      "learning_rate": 4.994675186368477e-07,
+      "loss_func": "stage2",
+      "step": 9
+    },
+    {
+      "epoch": 0.02,
+      "importance_ratio": 1.000004768371582,
+      "kl_div_avg": 0.0006192830041982234,
+      "learning_rate": 4.992900248491303e-07,
+      "loss_func": "stage2",
+      "step": 10
+    },
+    {
+      "epoch": 0.02,
+      "importance_ratio": 1.0005085468292236,
+      "kl_div_avg": 0.00045003174454905093,
+      "learning_rate": 4.991125310614128e-07,
+      "loss_func": "stage2",
+      "step": 11
+    },
+    {
+      "epoch": 0.02,
+      "importance_ratio": 1.0013489723205566,
+      "kl_div_avg": 0.0017541071865707636,
+      "learning_rate": 4.989350372736954e-07,
+      "loss_func": "stage2",
+      "step": 12
+    },
+    {
+      "epoch": 0.03,
+      "importance_ratio": 0.9999700784683228,
+      "kl_div_avg": 0.0006275521591305733,
+      "learning_rate": 4.98757543485978e-07,
+      "loss_func": "stage2",
+      "step": 13
+    },
+    {
+      "epoch": 0.03,
+      "importance_ratio": 0.9999388456344604,
+      "kl_div_avg": 0.0024280191864818335,
+      "learning_rate": 4.985800496982605e-07,
+      "loss_func": "stage2",
+      "step": 14
+    },
+    {
+      "epoch": 0.03,
+      "importance_ratio": 1.0000910758972168,
+      "kl_div_avg": 0.0032281712628901005,
+      "learning_rate": 4.984025559105431e-07,
+      "loss_func": "stage2",
+      "step": 15
+    },
+    {
+      "epoch": 0.03,
+      "importance_ratio": 1.0002039670944214,
+      "kl_div_avg": 0.0023550165351480246,
+      "learning_rate": 4.982250621228256e-07,
+      "loss_func": "stage2",
+      "step": 16
+    },
+    {
+      "epoch": 0.04,
+      "importance_ratio": 0.9999492764472961,
+      "kl_div_avg": 0.0031565451063215733,
+      "learning_rate": 4.980475683351083e-07,
+      "loss_func": "stage2",
+      "step": 17
+    },
+    {
+      "epoch": 0.04,
+      "importance_ratio": 1.00008225440979,
+      "kl_div_avg": 0.002413892187178135,
+      "learning_rate": 4.978700745473908e-07,
+      "loss_func": "stage2",
+      "step": 18
+    },
+    {
+      "epoch": 0.04,
+      "importance_ratio": 1.0003724098205566,
+      "kl_div_avg": 0.003695876570418477,
+      "learning_rate": 4.976925807596735e-07,
+      "loss_func": "stage2",
+      "step": 19
+    },
+    {
+      "epoch": 0.04,
+      "importance_ratio": 1.000093698501587,
+      "kl_div_avg": 0.0021200496703386307,
+      "learning_rate": 4.975150869719559e-07,
+      "loss_func": "stage2",
+      "step": 20
+    },
+    {
+      "epoch": 0.04,
+      "importance_ratio": 0.9999486804008484,
+      "kl_div_avg": 0.0036798480432480574,
+      "learning_rate": 4.973375931842385e-07,
+      "loss_func": "stage2",
+      "step": 21
+    },
+    {
+      "epoch": 0.04,
+      "importance_ratio": 1.000074863433838,
+      "kl_div_avg": 0.00499952444806695,
+      "learning_rate": 4.971600993965211e-07,
+      "loss_func": "stage2",
+      "step": 22
+    },
+    {
+      "epoch": 0.04,
+      "importance_ratio": 1.0000168085098267,
+      "kl_div_avg": 0.00605671014636755,
+      "learning_rate": 4.969826056088036e-07,
+      "loss_func": "stage2",
+      "step": 23
+    },
+    {
+      "epoch": 0.04,
+      "importance_ratio": 0.9998180270195007,
+      "kl_div_avg": 0.002726056147366762,
+      "learning_rate": 4.968051118210863e-07,
+      "loss_func": "stage2",
+      "step": 24
+    },
+    {
+      "epoch": 0.05,
+      "importance_ratio": 1.0000224113464355,
+      "kl_div_avg": 0.004386036656796932,
+      "learning_rate": 4.966276180333688e-07,
+      "loss_func": "stage2",
+      "step": 25,
+      "total_loss": 1.25
+    },
+    {
+      "epoch": 0.05,
+      "importance_ratio": 1.0000128746032715,
+      "kl_div_avg": 0.004579769913107157,
+      "learning_rate": 4.964501242456514e-07,
+      "loss_func": "stage2",
+      "step": 26,
+      "total_loss": -1.4375
+    },
+    {
+      "epoch": 0.05,
+      "importance_ratio": 1.0004100799560547,
+      "kl_div_avg": 0.0037285620346665382,
+      "learning_rate": 4.962726304579339e-07,
+      "loss_func": "stage2",
+      "step": 27,
+      "total_loss": -1.5078125
+    },
+    {
+      "epoch": 0.05,
+      "importance_ratio": 1.0007517337799072,
+      "kl_div_avg": 0.0051081208512187,
+      "learning_rate": 4.960951366702166e-07,
+      "loss_func": "stage2",
+      "step": 28,
+      "total_loss": 1.390625
+    },
+    {
+      "epoch": 0.06,
+      "importance_ratio": 1.0001684427261353,
+      "kl_div_avg": 0.006544313859194517,
+      "learning_rate": 4.959176428824991e-07,
+      "loss_func": "stage2",
+      "step": 29,
+      "total_loss": 0.29296875
+    },
+    {
+      "epoch": 0.06,
+      "importance_ratio": 0.9997209310531616,
+      "kl_div_avg": 0.005731683224439621,
+      "learning_rate": 4.957401490947816e-07,
+      "loss_func": "stage2",
+      "step": 30,
+      "total_loss": -1.8203125
+    },
+    {
+      "epoch": 0.06,
+      "importance_ratio": 0.9997566342353821,
+      "kl_div_avg": 0.006199344992637634,
+      "learning_rate": 4.955626553070642e-07,
+      "loss_func": "stage2",
+      "step": 31,
+      "total_loss": 0.27734375
+    },
+    {
+      "epoch": 0.06,
+      "importance_ratio": 0.9999147653579712,
+      "kl_div_avg": 0.006669708527624607,
+      "learning_rate": 4.953851615193468e-07,
+      "loss_func": "stage2",
+      "step": 32,
+      "total_loss": 0.69140625
+    },
+    {
+      "epoch": 0.06,
+      "importance_ratio": 1.0000065565109253,
+      "kl_div_avg": 0.006318950094282627,
+      "learning_rate": 4.952076677316294e-07,
+      "loss_func": "stage2",
+      "step": 33
+    },
+    {
+      "epoch": 0.06,
+      "importance_ratio": 0.9999251365661621,
+      "kl_div_avg": 0.00437127472832799,
+      "learning_rate": 4.950301739439119e-07,
+      "loss_func": "stage2",
+      "step": 34
+    },
+    {
+      "epoch": 0.06,
+      "importance_ratio": 0.9998654127120972,
+      "kl_div_avg": 0.005642242729663849,
+      "learning_rate": 4.948526801561946e-07,
+      "loss_func": "stage2",
+      "step": 35
+    },
+    {
+      "epoch": 0.06,
+      "importance_ratio": 0.9998089671134949,
+      "kl_div_avg": 0.005031378474086523,
+      "learning_rate": 4.946751863684771e-07,
+      "loss_func": "stage2",
+      "step": 36
+    },
+    {
+      "epoch": 0.07,
+      "importance_ratio": 1.000056266784668,
+      "kl_div_avg": 0.003056820947676897,
+      "learning_rate": 4.944976925807596e-07,
+      "loss_func": "stage2",
+      "step": 37
+    },
+    {
+      "epoch": 0.07,
+      "importance_ratio": 0.9996089935302734,
+      "kl_div_avg": 0.006132831797003746,
+      "learning_rate": 4.943201987930422e-07,
+      "loss_func": "stage2",
+      "step": 38
+    },
+    {
+      "epoch": 0.07,
+      "importance_ratio": 0.9995301365852356,
+      "kl_div_avg": 0.007893526926636696,
+      "learning_rate": 4.941427050053248e-07,
+      "loss_func": "stage2",
+      "step": 39
+    },
+    {
+      "epoch": 0.07,
+      "importance_ratio": 0.9994716644287109,
+      "kl_div_avg": 0.0066243866458535194,
+      "learning_rate": 4.939652112176074e-07,
+      "loss_func": "stage2",
+      "step": 40
+    },
+    {
+      "epoch": 0.08,
+      "importance_ratio": 0.9999615550041199,
+      "kl_div_avg": 0.0057721324265003204,
+      "learning_rate": 4.937877174298899e-07,
+      "loss_func": "stage2",
+      "step": 41
+    },
+    {
+      "epoch": 0.08,
+      "importance_ratio": 0.9999039173126221,
+      "kl_div_avg": 0.0047083706595003605,
+      "learning_rate": 4.936102236421725e-07,
+      "loss_func": "stage2",
+      "step": 42
+    },
+    {
+      "epoch": 0.08,
+      "importance_ratio": 1.0000641345977783,
+      "kl_div_avg": 0.004748873878270388,
+      "learning_rate": 4.93432729854455e-07,
+      "loss_func": "stage2",
+      "step": 43
+    },
+    {
+      "epoch": 0.08,
+      "importance_ratio": 1.0001894235610962,
+      "kl_div_avg": 0.005917009431868792,
+      "learning_rate": 4.932552360667377e-07,
+      "loss_func": "stage2",
+      "step": 44
+    },
+    {
+      "epoch": 0.09,
+      "importance_ratio": 0.9999311566352844,
+      "kl_div_avg": 0.0092976875603199,
+      "learning_rate": 4.930777422790202e-07,
+      "loss_func": "stage2",
+      "step": 45
+    },
+    {
+      "epoch": 0.09,
+      "importance_ratio": 1.0000026226043701,
+      "kl_div_avg": 0.005377490073442459,
+      "learning_rate": 4.929002484913027e-07,
+      "loss_func": "stage2",
+      "step": 46
+    },
+    {
+      "epoch": 0.09,
+      "importance_ratio": 1.0002326965332031,
+      "kl_div_avg": 0.004946079570800066,
+      "learning_rate": 4.927227547035854e-07,
+      "loss_func": "stage2",
+      "step": 47
+    },
+    {
+      "epoch": 0.09,
+      "importance_ratio": 1.0004756450653076,
+      "kl_div_avg": 0.00948173925280571,
+      "learning_rate": 4.925452609158679e-07,
+      "loss_func": "stage2",
+      "step": 48
+    },
+    {
+      "epoch": 0.09,
+      "importance_ratio": 1.0000537633895874,
+      "kl_div_avg": 0.006274993997067213,
+      "learning_rate": 4.923677671281505e-07,
+      "loss_func": "stage2",
+      "step": 49,
+      "total_loss": -0.13671875
+    },
+    {
+      "epoch": 0.09,
+      "importance_ratio": 1.0003995895385742,
+      "kl_div_avg": 0.00666253874078393,
+      "learning_rate": 4.92190273340433e-07,
+      "loss_func": "stage2",
+      "step": 50,
+      "total_loss": -1.0
+    },
+    {
+      "epoch": 0.09,
+      "importance_ratio": 1.000571608543396,
+      "kl_div_avg": 0.005418341141194105,
+      "learning_rate": 4.920127795527157e-07,
+      "loss_func": "stage2",
+      "step": 51,
+      "total_loss": 0.984375
+    },
+    {
+      "epoch": 0.09,
+      "importance_ratio": 1.0003688335418701,
+      "kl_div_avg": 0.0072724176570773125,
+      "learning_rate": 4.918352857649982e-07,
+      "loss_func": "stage2",
+      "step": 52
+    },
+    {
+      "epoch": 0.1,
+      "importance_ratio": 0.9999268054962158,
+      "kl_div_avg": 0.006783046759665012,
+      "learning_rate": 4.916577919772808e-07,
+      "loss_func": "stage2",
+      "step": 53,
+      "total_loss": 1.6015625
+    },
+    {
+      "epoch": 0.1,
+      "importance_ratio": 0.9999312162399292,
+      "kl_div_avg": 0.008320106193423271,
+      "learning_rate": 4.914802981895633e-07,
+      "loss_func": "stage2",
+      "step": 54,
+      "total_loss": -1.078125
+    },
+    {
+      "epoch": 0.1,
+      "importance_ratio": 1.0003933906555176,
+      "kl_div_avg": 0.007337601855397224,
+      "learning_rate": 4.91302804401846e-07,
+      "loss_func": "stage2",
+      "step": 55,
+      "total_loss": -1.03125
+    },
+    {
+      "epoch": 0.1,
+      "importance_ratio": 1.0005271434783936,
+      "kl_div_avg": 0.007942966185510159,
+      "learning_rate": 4.911253106141285e-07,
+      "loss_func": "stage2",
+      "step": 56
+    },
+    {
+      "epoch": 0.11,
+      "importance_ratio": 0.9998326301574707,
+      "kl_div_avg": 0.0074424054473638535,
+      "learning_rate": 4.90947816826411e-07,
+      "loss_func": "stage2",
+      "step": 57,
+      "total_loss": -0.30859375
+    },
+    {
+      "epoch": 0.11,
+      "importance_ratio": 1.000032901763916,
+      "kl_div_avg": 0.0086433794349432,
+      "learning_rate": 4.907703230386937e-07,
+      "loss_func": "stage2",
+      "step": 58,
+      "total_loss": 0.92578125
+    },
+    {
+      "epoch": 0.11,
+      "importance_ratio": 1.0001921653747559,
+      "kl_div_avg": 0.006375900469720364,
+      "learning_rate": 4.905928292509761e-07,
+      "loss_func": "stage2",
+      "step": 59
+    },
+    {
+      "epoch": 0.11,
+      "importance_ratio": 1.0001673698425293,
+      "kl_div_avg": 0.006327753886580467,
+      "learning_rate": 4.904153354632588e-07,
+      "loss_func": "stage2",
+      "step": 60,
+      "total_loss": -0.3828125
+    },
+    {
+      "epoch": 0.11,
+      "importance_ratio": 1.0000410079956055,
+      "kl_div_avg": 0.006148340180516243,
+      "learning_rate": 4.902378416755413e-07,
+      "loss_func": "stage2",
+      "step": 61,
+      "total_loss": -1.125
+    },
+    {
+      "epoch": 0.11,
+      "importance_ratio": 1.0002260208129883,
+      "kl_div_avg": 0.008232634514570236,
+      "learning_rate": 4.90060347887824e-07,
+      "loss_func": "stage2",
+      "step": 62,
+      "total_loss": 0.7109375
+    },
+    {
+      "epoch": 0.11,
+      "importance_ratio": 1.0003855228424072,
+      "kl_div_avg": 0.007037755101919174,
+      "learning_rate": 4.898828541001065e-07,
+      "loss_func": "stage2",
+      "step": 63,
+      "total_loss": -1.046875
+    },
+    {
+      "epoch": 0.11,
+      "importance_ratio": 1.0001959800720215,
+      "kl_div_avg": 0.006870637647807598,
+      "learning_rate": 4.89705360312389e-07,
+      "loss_func": "stage2",
+      "step": 64
+    },
+    {
+      "epoch": 0.12,
+      "importance_ratio": 0.9997708797454834,
+      "kl_div_avg": 0.012268968857824802,
+      "learning_rate": 4.895278665246716e-07,
+      "loss_func": "stage2",
+      "step": 65
+    },
+    {
+      "epoch": 0.12,
+      "importance_ratio": 0.9996875524520874,
+      "kl_div_avg": 0.006337402388453484,
+      "learning_rate": 4.893503727369541e-07,
+      "loss_func": "stage2",
+      "step": 66,
+      "total_loss": 0.90625
+    },
+    {
+      "epoch": 0.12,
+      "importance_ratio": 0.9995450973510742,
+      "kl_div_avg": 0.005748244933784008,
+      "learning_rate": 4.891728789492368e-07,
+      "loss_func": "stage2",
+      "step": 67,
+      "total_loss": -0.5078125
+    },
+    {
+      "epoch": 0.12,
+      "importance_ratio": 0.9986073970794678,
+      "kl_div_avg": 0.011714652180671692,
+      "learning_rate": 4.889953851615193e-07,
+      "loss_func": "stage2",
+      "step": 68
+    },
+    {
+      "epoch": 0.13,
+      "importance_ratio": 0.9997700452804565,
+      "kl_div_avg": 0.009843539446592331,
+      "learning_rate": 4.888178913738019e-07,
+      "loss_func": "stage2",
+      "step": 69,
+      "total_loss": -0.40234375
+    },
+    {
+      "epoch": 0.13,
+      "importance_ratio": 0.9998771548271179,
+      "kl_div_avg": 0.011152197606861591,
+      "learning_rate": 4.886403975860844e-07,
+      "loss_func": "stage2",
+      "step": 70,
+      "total_loss": -0.380859375
+    },
+    {
+      "epoch": 0.13,
+      "importance_ratio": 0.9998139142990112,
+      "kl_div_avg": 0.010612234473228455,
+      "learning_rate": 4.884629037983671e-07,
+      "loss_func": "stage2",
+      "step": 71,
+      "total_loss": -0.408203125
+    },
+    {
+      "epoch": 0.13,
+      "importance_ratio": 0.9994380474090576,
+      "kl_div_avg": 0.008871862664818764,
+      "learning_rate": 4.882854100106496e-07,
+      "loss_func": "stage2",
+      "step": 72,
+      "total_loss": 1.484375
+    },
+    {
+      "epoch": 0.13,
+      "importance_ratio": 1.000580072402954,
+      "kl_div_avg": 0.01336180604994297,
+      "learning_rate": 4.881079162229321e-07,
+      "loss_func": "stage2",
+      "step": 73,
+      "total_loss": 0.69921875
+    },
+    {
+      "epoch": 0.13,
+      "importance_ratio": 0.9992865324020386,
+      "kl_div_avg": 0.01184939220547676,
+      "learning_rate": 4.879304224352148e-07,
+      "loss_func": "stage2",
+      "step": 74,
+      "total_loss": 0.9609375
+    },
+    {
+      "epoch": 0.13,
+      "importance_ratio": 0.9983202219009399,
+      "kl_div_avg": 0.013325630687177181,
+      "learning_rate": 4.877529286474973e-07,
+      "loss_func": "stage2",
+      "step": 75,
+      "total_loss": 0.99609375
+    },
+    {
+      "epoch": 0.13,
+      "importance_ratio": 0.9995129108428955,
+      "kl_div_avg": 0.01113186962902546,
+      "learning_rate": 4.875754348597799e-07,
+      "loss_func": "stage2",
+      "step": 76,
+      "total_loss": -1.8984375
+    },
+    {
+      "epoch": 0.14,
+      "importance_ratio": 0.9999693632125854,
+      "kl_div_avg": 0.0077874367125332355,
+      "learning_rate": 4.873979410720624e-07,
+      "loss_func": "stage2",
+      "step": 77,
+      "total_loss": -0.3125
+    },
+    {
+      "epoch": 0.14,
+      "importance_ratio": 1.0004016160964966,
+      "kl_div_avg": 0.007076024077832699,
+      "learning_rate": 4.872204472843451e-07,
+      "loss_func": "stage2",
+      "step": 78,
+      "total_loss": -0.4765625
+    },
+    {
+      "epoch": 0.14,
+      "importance_ratio": 1.0008529424667358,
+      "kl_div_avg": 0.008574053645133972,
+      "learning_rate": 4.870429534966276e-07,
+      "loss_func": "stage2",
+      "step": 79
+    },
+    {
+      "epoch": 0.14,
+      "importance_ratio": 1.0009359121322632,
+      "kl_div_avg": 0.006657534744590521,
+      "learning_rate": 4.868654597089102e-07,
+      "loss_func": "stage2",
+      "step": 80,
+      "total_loss": 0.953125
+    },
+    {
+      "epoch": 0.15,
+      "importance_ratio": 0.9999938011169434,
+      "kl_div_avg": 0.01249817293137312,
+      "learning_rate": 4.866879659211927e-07,
+      "loss_func": "stage2",
+      "step": 81,
+      "total_loss": 1.421875
+    },
+    {
+      "epoch": 0.15,
+      "importance_ratio": 1.0006179809570312,
+      "kl_div_avg": 0.011909810826182365,
+      "learning_rate": 4.865104721334753e-07,
+      "loss_func": "stage2",
+      "step": 82,
+      "total_loss": -0.189453125
+    },
+    {
+      "epoch": 0.15,
+      "importance_ratio": 1.0008633136749268,
+      "kl_div_avg": 0.01326713990420103,
+      "learning_rate": 4.863329783457579e-07,
+      "loss_func": "stage2",
+      "step": 83
+    },
+    {
+      "epoch": 0.15,
+      "importance_ratio": 1.0014276504516602,
+      "kl_div_avg": 0.010602062568068504,
+      "learning_rate": 4.861554845580404e-07,
+      "loss_func": "stage2",
+      "step": 84,
+      "total_loss": -0.33203125
+    },
+    {
+      "epoch": 0.16,
+      "importance_ratio": 1.0000020265579224,
+      "kl_div_avg": 0.010696541517972946,
+      "learning_rate": 4.859779907703231e-07,
+      "loss_func": "stage2",
+      "step": 85
+    },
+    {
+      "epoch": 0.16,
+      "importance_ratio": 0.9999282360076904,
+      "kl_div_avg": 0.013704588636755943,
+      "learning_rate": 4.858004969826056e-07,
+      "loss_func": "stage2",
+      "step": 86
+    },
+    {
+      "epoch": 0.16,
+      "importance_ratio": 0.9988713264465332,
+      "kl_div_avg": 0.015348710119724274,
+      "learning_rate": 4.856230031948882e-07,
+      "loss_func": "stage2",
+      "step": 87
+    },
+    {
+      "epoch": 0.16,
+      "importance_ratio": 0.9998327493667603,
+      "kl_div_avg": 0.012932014651596546,
+      "learning_rate": 4.854455094071707e-07,
+      "loss_func": "stage2",
+      "step": 88
+    },
+    {
+      "epoch": 0.16,
+      "importance_ratio": 1.0000953674316406,
+      "kl_div_avg": 0.012728630565106869,
+      "learning_rate": 4.852680156194532e-07,
+      "loss_func": "stage2",
+      "step": 89
+    },
+    {
+      "epoch": 0.16,
+      "importance_ratio": 0.9999107122421265,
+      "kl_div_avg": 0.012436306104063988,
+      "learning_rate": 4.850905218317359e-07,
+      "loss_func": "stage2",
+      "step": 90
+    },
+    {
+      "epoch": 0.16,
+      "importance_ratio": 1.0001928806304932,
+      "kl_div_avg": 0.010830108076334,
+      "learning_rate": 4.849130280440184e-07,
+      "loss_func": "stage2",
+      "step": 91
+    },
+    {
+      "epoch": 0.16,
+      "importance_ratio": 0.9993023872375488,
+      "kl_div_avg": 0.011476454325020313,
+      "learning_rate": 4.84735534256301e-07,
+      "loss_func": "stage2",
+      "step": 92
+    },
+    {
+      "epoch": 0.17,
+      "importance_ratio": 1.0001647472381592,
+      "kl_div_avg": 0.007908498868346214,
+      "learning_rate": 4.845580404685835e-07,
+      "loss_func": "stage2",
+      "step": 93,
+      "total_loss": -0.97265625
+    },
+    {
+      "epoch": 0.17,
+      "importance_ratio": 0.9995372295379639,
+      "kl_div_avg": 0.012396270409226418,
+      "learning_rate": 4.843805466808662e-07,
+      "loss_func": "stage2",
+      "step": 94,
+      "total_loss": 0.328125
+    },
+    {
+      "epoch": 0.17,
+      "importance_ratio": 0.9988420009613037,
+      "kl_div_avg": 0.012851856648921967,
+      "learning_rate": 4.842030528931487e-07,
+      "loss_func": "stage2",
+      "step": 95,
+      "total_loss": 0.251953125
+    },
+    {
+      "epoch": 0.17,
+      "importance_ratio": 0.9984779357910156,
+      "kl_div_avg": 0.010285125114023685,
+      "learning_rate": 4.840255591054313e-07,
+      "loss_func": "stage2",
+      "step": 96
+    },
+    {
+      "epoch": 0.18,
+      "importance_ratio": 1.000007152557373,
+      "kl_div_avg": 0.01037362776696682,
+      "learning_rate": 4.838480653177139e-07,
+      "loss_func": "stage2",
+      "step": 97
+    },
+    {
+      "epoch": 0.18,
+      "importance_ratio": 0.9997274875640869,
+      "kl_div_avg": 0.010967772454023361,
+      "learning_rate": 4.836705715299965e-07,
+      "loss_func": "stage2",
+      "step": 98
+    },
+    {
+      "epoch": 0.18,
+      "importance_ratio": 0.9996849298477173,
+      "kl_div_avg": 0.011234309524297714,
+      "learning_rate": 4.83493077742279e-07,
+      "loss_func": "stage2",
+      "step": 99
+    },
+    {
+      "epoch": 0.18,
+      "importance_ratio": 0.9995231628417969,
+      "kl_div_avg": 0.010765241459012032,
+      "learning_rate": 4.833155839545615e-07,
+      "loss_func": "stage2",
+      "step": 100
+    },
+    {
+      "epoch": 0.18,
+      "importance_ratio": 0.9997826814651489,
+      "kl_div_avg": 0.0107155442237854,
+      "learning_rate": 4.831380901668442e-07,
+      "loss_func": "stage2",
+      "step": 101,
+      "total_loss": 0.8515625
+    },
+    {
+      "epoch": 0.18,
+      "importance_ratio": 1.0002464056015015,
+      "kl_div_avg": 0.010471027344465256,
+      "learning_rate": 4.829605963791267e-07,
+      "loss_func": "stage2",
+      "step": 102,
+      "total_loss": -0.3359375
+    },
+    {
+      "epoch": 0.18,
+      "importance_ratio": 1.000614881515503,
+      "kl_div_avg": 0.009624524042010307,
+      "learning_rate": 4.827831025914093e-07,
+      "loss_func": "stage2",
+      "step": 103,
+      "total_loss": -1.46875
+    },
+    {
+      "epoch": 0.18,
+      "importance_ratio": 1.000823974609375,
+      "kl_div_avg": 0.01463034376502037,
+      "learning_rate": 4.826056088036918e-07,
+      "loss_func": "stage2",
+      "step": 104
+    },
+    {
+      "epoch": 0.19,
+      "importance_ratio": 0.999914288520813,
+      "kl_div_avg": 0.008821999654173851,
+      "learning_rate": 4.824281150159745e-07,
+      "loss_func": "stage2",
+      "step": 105,
+      "total_loss": 0.361328125
+    },
+    {
+      "epoch": 0.19,
+      "importance_ratio": 1.0005583763122559,
+      "kl_div_avg": 0.014264218509197235,
+      "learning_rate": 4.82250621228257e-07,
+      "loss_func": "stage2",
+      "step": 106,
+      "total_loss": 0.96484375
+    },
+    {
+      "epoch": 0.19,
+      "importance_ratio": 1.0007658004760742,
+      "kl_div_avg": 0.006260848604142666,
+      "learning_rate": 4.820731274405395e-07,
+      "loss_func": "stage2",
+      "step": 107,
+      "total_loss": -0.78515625
+    },
+    {
+      "epoch": 0.19,
+      "importance_ratio": 1.001744031906128,
+      "kl_div_avg": 0.010768534615635872,
+      "learning_rate": 4.818956336528222e-07,
+      "loss_func": "stage2",
+      "step": 108,
+      "total_loss": -0.10888671875
+    },
+    {
+      "epoch": 0.2,
+      "importance_ratio": 1.0000841617584229,
+      "kl_div_avg": 0.010653991252183914,
+      "learning_rate": 4.817181398651046e-07,
+      "loss_func": "stage2",
+      "step": 109
+    },
+    {
+      "epoch": 0.2,
+      "importance_ratio": 1.0003538131713867,
+      "kl_div_avg": 0.010306619107723236,
+      "learning_rate": 4.815406460773873e-07,
+      "loss_func": "stage2",
+      "step": 110
+    },
+    {
+      "epoch": 0.2,
+      "importance_ratio": 1.0002071857452393,
+      "kl_div_avg": 0.017675137147307396,
+      "learning_rate": 4.813631522896698e-07,
+      "loss_func": "stage2",
+      "step": 111
+    },
+    {
+      "epoch": 0.2,
+      "importance_ratio": 1.0005652904510498,
+      "kl_div_avg": 0.014808263629674911,
+      "learning_rate": 4.811856585019524e-07,
+      "loss_func": "stage2",
+      "step": 112
+    },
+    {
+      "epoch": 0.21,
+      "importance_ratio": 0.9998864531517029,
+      "kl_div_avg": 0.007133196573704481,
+      "learning_rate": 4.81008164714235e-07,
+      "loss_func": "stage2",
+      "step": 113
+    },
+    {
+      "epoch": 0.21,
+      "importance_ratio": 0.9993472695350647,
+      "kl_div_avg": 0.012266352772712708,
+      "learning_rate": 4.808306709265176e-07,
+      "loss_func": "stage2",
+      "step": 114,
+      "total_loss": -0.9609375
+    },
+    {
+      "epoch": 0.21,
+      "importance_ratio": 0.999112606048584,
+      "kl_div_avg": 0.009540843777358532,
+      "learning_rate": 4.806531771388001e-07,
+      "loss_func": "stage2",
+      "step": 115,
+      "total_loss": 1.390625
+    },
+    {
+      "epoch": 0.21,
+      "importance_ratio": 0.9988101720809937,
+      "kl_div_avg": 0.010600866749882698,
+      "learning_rate": 4.804756833510826e-07,
+      "loss_func": "stage2",
+      "step": 116,
+      "total_loss": 0.68359375
+    },
+    {
+      "epoch": 0.21,
+      "importance_ratio": 1.0000008344650269,
+      "kl_div_avg": 0.007777961902320385,
+      "learning_rate": 4.802981895633653e-07,
+      "loss_func": "stage2",
+      "step": 117
+    },
+    {
+      "epoch": 0.21,
+      "importance_ratio": 0.9992760419845581,
+      "kl_div_avg": 0.008088795468211174,
+      "learning_rate": 4.801206957756478e-07,
+      "loss_func": "stage2",
+      "step": 118
+    },
+    {
+      "epoch": 0.21,
+      "importance_ratio": 0.9990713596343994,
+      "kl_div_avg": 0.009075586684048176,
+      "learning_rate": 4.799432019879304e-07,
+      "loss_func": "stage2",
+      "step": 119
+    },
+    {
+      "epoch": 0.21,
+      "importance_ratio": 0.9986318349838257,
+      "kl_div_avg": 0.0066194236278533936,
+      "learning_rate": 4.797657082002129e-07,
+      "loss_func": "stage2",
+      "step": 120
+    },
+    {
+      "epoch": 0.22,
+      "importance_ratio": 0.9999872446060181,
+      "kl_div_avg": 0.007935302332043648,
+      "learning_rate": 4.795882144124956e-07,
+      "loss_func": "stage2",
+      "step": 121
+    },
+    {
+      "epoch": 0.22,
+      "importance_ratio": 0.9994752407073975,
+      "kl_div_avg": 0.007028103340417147,
+      "learning_rate": 4.794107206247781e-07,
+      "loss_func": "stage2",
+      "step": 122
+    },
+    {
+      "epoch": 0.22,
+      "importance_ratio": 0.9989815354347229,
+      "kl_div_avg": 0.0070693036541342735,
+      "learning_rate": 4.792332268370607e-07,
+      "loss_func": "stage2",
+      "step": 123
+    },
+    {
+      "epoch": 0.22,
+      "importance_ratio": 0.9994947910308838,
+      "kl_div_avg": 0.007683721836656332,
+      "learning_rate": 4.790557330493433e-07,
+      "loss_func": "stage2",
+      "step": 124
+    },
+    {
+      "epoch": 0.23,
+      "importance_ratio": 0.9996501207351685,
+      "kl_div_avg": 0.010995806194841862,
+      "learning_rate": 4.788782392616257e-07,
+      "loss_func": "stage2",
+      "step": 125,
+      "total_loss": -0.55859375
+    },
+    {
+      "epoch": 0.23,
+      "importance_ratio": 1.000380516052246,
+      "kl_div_avg": 0.01340421661734581,
+      "learning_rate": 4.787007454739084e-07,
+      "loss_func": "stage2",
+      "step": 126
+    },
+    {
+      "epoch": 0.23,
+      "importance_ratio": 1.000959873199463,
+      "kl_div_avg": 0.009610550478100777,
+      "learning_rate": 4.785232516861909e-07,
+      "loss_func": "stage2",
+      "step": 127,
+      "total_loss": -0.0625
+    },
+    {
+      "epoch": 0.23,
+      "importance_ratio": 1.0012009143829346,
+      "kl_div_avg": 0.009116636589169502,
+      "learning_rate": 4.783457578984736e-07,
+      "loss_func": "stage2",
+      "step": 128,
+      "total_loss": -0.66015625
+    },
+    {
+      "epoch": 0.23,
+      "importance_ratio": 1.0001189708709717,
+      "kl_div_avg": 0.010150732472538948,
+      "learning_rate": 4.781682641107561e-07,
+      "loss_func": "stage2",
+      "step": 129
+    },
+    {
+      "epoch": 0.23,
+      "importance_ratio": 1.0003772974014282,
+      "kl_div_avg": 0.010458733886480331,
+      "learning_rate": 4.779907703230387e-07,
+      "loss_func": "stage2",
+      "step": 130
+    },
+    {
+      "epoch": 0.23,
+      "importance_ratio": 0.9999233484268188,
+      "kl_div_avg": 0.009825407527387142,
+      "learning_rate": 4.778132765353212e-07,
+      "loss_func": "stage2",
+      "step": 131
+    },
+    {
+      "epoch": 0.23,
+      "importance_ratio": 1.001006007194519,
+      "kl_div_avg": 0.00745510496199131,
+      "learning_rate": 4.776357827476038e-07,
+      "loss_func": "stage2",
+      "step": 132
+    },
+    {
+      "epoch": 0.24,
+      "importance_ratio": 0.9999349117279053,
+      "kl_div_avg": 0.013605897314846516,
+      "learning_rate": 4.774582889598864e-07,
+      "loss_func": "stage2",
+      "step": 133
+    },
+    {
+      "epoch": 0.24,
+      "importance_ratio": 0.9999945163726807,
+      "kl_div_avg": 0.010575266554951668,
+      "learning_rate": 4.772807951721689e-07,
+      "loss_func": "stage2",
+      "step": 134
+    },
+    {
+      "epoch": 0.24,
+      "importance_ratio": 1.0001717805862427,
+      "kl_div_avg": 0.01219017431139946,
+      "learning_rate": 4.771033013844515e-07,
+      "loss_func": "stage2",
+      "step": 135
+    },
+    {
+      "epoch": 0.24,
+      "importance_ratio": 0.9999402165412903,
+      "kl_div_avg": 0.014882136136293411,
+      "learning_rate": 4.769258075967341e-07,
+      "loss_func": "stage2",
+      "step": 136
+    },
+    {
+      "epoch": 0.25,
+      "importance_ratio": 0.9999697208404541,
+      "kl_div_avg": 0.011173544451594353,
+      "learning_rate": 4.767483138090166e-07,
+      "loss_func": "stage2",
+      "step": 137,
+      "total_loss": -0.9765625
+    },
+    {
+      "epoch": 0.25,
+      "importance_ratio": 0.9997889399528503,
+      "kl_div_avg": 0.009266650304198265,
+      "learning_rate": 4.765708200212992e-07,
+      "loss_func": "stage2",
+      "step": 138,
+      "total_loss": 1.1484375
+    },
+    {
+      "epoch": 0.25,
+      "importance_ratio": 0.9998738765716553,
+      "kl_div_avg": 0.00967929232865572,
+      "learning_rate": 4.763933262335818e-07,
+      "loss_func": "stage2",
+      "step": 139,
+      "total_loss": 1.7109375
+    },
+    {
+      "epoch": 0.25,
+      "importance_ratio": 0.9994757771492004,
+      "kl_div_avg": 0.007988542318344116,
+      "learning_rate": 4.7621583244586436e-07,
+      "loss_func": "stage2",
+      "step": 140,
+      "total_loss": -1.609375
+    },
+    {
+      "epoch": 0.26,
+      "importance_ratio": 0.9999532699584961,
+      "kl_div_avg": 0.009091874584555626,
+      "learning_rate": 4.7603833865814696e-07,
+      "loss_func": "stage2",
+      "step": 141
+    },
+    {
+      "epoch": 0.26,
+      "importance_ratio": 0.9995992183685303,
+      "kl_div_avg": 0.011182118207216263,
+      "learning_rate": 4.758608448704295e-07,
+      "loss_func": "stage2",
+      "step": 142,
+      "total_loss": 0.796875
+    },
+    {
+      "epoch": 0.26,
+      "importance_ratio": 0.9996076226234436,
+      "kl_div_avg": 0.015766486525535583,
+      "learning_rate": 4.756833510827121e-07,
+      "loss_func": "stage2",
+      "step": 143
+    },
+    {
+      "epoch": 0.26,
+      "importance_ratio": 0.999364972114563,
+      "kl_div_avg": 0.013693357817828655,
+      "learning_rate": 4.7550585729499464e-07,
+      "loss_func": "stage2",
+      "step": 144,
+      "total_loss": -0.47265625
+    },
+    {
+      "epoch": 0.26,
+      "importance_ratio": 0.9999587535858154,
+      "kl_div_avg": 0.005303369835019112,
+      "learning_rate": 4.7532836350727723e-07,
+      "loss_func": "stage2",
+      "step": 145
+    },
+    {
+      "epoch": 0.26,
+      "importance_ratio": 1.0001214742660522,
+      "kl_div_avg": 0.0063499645330011845,
+      "learning_rate": 4.7515086971955983e-07,
+      "loss_func": "stage2",
+      "step": 146,
+      "total_loss": -1.4375
+    },
+    {
+      "epoch": 0.26,
+      "importance_ratio": 1.0005862712860107,
+      "kl_div_avg": 0.007753903977572918,
+      "learning_rate": 4.7497337593184237e-07,
+      "loss_func": "stage2",
+      "step": 147,
+      "total_loss": 1.953125
+    },
+    {
+      "epoch": 0.26,
+      "importance_ratio": 1.000730037689209,
+      "kl_div_avg": 0.005649554077535868,
+      "learning_rate": 4.747958821441249e-07,
+      "loss_func": "stage2",
+      "step": 148,
+      "total_loss": -1.609375
+    },
+    {
+      "epoch": 0.27,
+      "importance_ratio": 1.0000715255737305,
+      "kl_div_avg": 0.012152642011642456,
+      "learning_rate": 4.746183883564075e-07,
+      "loss_func": "stage2",
+      "step": 149,
+      "total_loss": 0.19140625
+    },
+    {
+      "epoch": 0.27,
+      "importance_ratio": 0.9999151825904846,
+      "kl_div_avg": 0.012796454131603241,
+      "learning_rate": 4.7444089456869005e-07,
+      "loss_func": "stage2",
+      "step": 150,
+      "total_loss": 0.11328125
+    },
+    {
+      "epoch": 0.27,
+      "importance_ratio": 1.0002349615097046,
+      "kl_div_avg": 0.01051037572324276,
+      "learning_rate": 4.7426340078097265e-07,
+      "loss_func": "stage2",
+      "step": 151,
+      "total_loss": -0.109375
+    },
+    {
+      "epoch": 0.27,
+      "importance_ratio": 1.001346230506897,
+      "kl_div_avg": 0.013909703120589256,
+      "learning_rate": 4.7408590699325524e-07,
+      "loss_func": "stage2",
+      "step": 152,
+      "total_loss": 0.0966796875
+    },
+    {
+      "epoch": 0.28,
+      "importance_ratio": 0.9999252557754517,
+      "kl_div_avg": 0.014116976410150528,
+      "learning_rate": 4.739084132055378e-07,
+      "loss_func": "stage2",
+      "step": 153
+    },
+    {
+      "epoch": 0.28,
+      "importance_ratio": 1.0002610683441162,
+      "kl_div_avg": 0.019843310117721558,
+      "learning_rate": 4.7373091941782033e-07,
+      "loss_func": "stage2",
+      "step": 154
+    },
+    {
+      "epoch": 0.28,
+      "importance_ratio": 1.0010803937911987,
+      "kl_div_avg": 0.014465966261923313,
+      "learning_rate": 4.735534256301029e-07,
+      "loss_func": "stage2",
+      "step": 155
+    },
+    {
+      "epoch": 0.28,
+      "importance_ratio": 1.0009663105010986,
+      "kl_div_avg": 0.012785017490386963,
+      "learning_rate": 4.733759318423855e-07,
+      "loss_func": "stage2",
+      "step": 156
+    },
+    {
+      "epoch": 0.28,
+      "importance_ratio": 1.0002055168151855,
+      "kl_div_avg": 0.014406196773052216,
+      "learning_rate": 4.7319843805466806e-07,
+      "loss_func": "stage2",
+      "step": 157
+    },
+    {
+      "epoch": 0.28,
+      "importance_ratio": 0.9999687671661377,
+      "kl_div_avg": 0.01354107353836298,
+      "learning_rate": 4.7302094426695066e-07,
+      "loss_func": "stage2",
+      "step": 158
+    },
+    {
+      "epoch": 0.28,
+      "importance_ratio": 0.9995388984680176,
+      "kl_div_avg": 0.010002201423048973,
+      "learning_rate": 4.728434504792332e-07,
+      "loss_func": "stage2",
+      "step": 159
+    },
+    {
+      "epoch": 0.28,
+      "importance_ratio": 0.9996526837348938,
+      "kl_div_avg": 0.012093874625861645,
+      "learning_rate": 4.7266595669151574e-07,
+      "loss_func": "stage2",
+      "step": 160
+    },
+    {
+      "epoch": 0.29,
+      "importance_ratio": 0.9997421503067017,
+      "kl_div_avg": 0.009743213653564453,
+      "learning_rate": 4.7248846290379834e-07,
+      "loss_func": "stage2",
+      "step": 161
+    },
+    {
+      "epoch": 0.29,
+      "importance_ratio": 0.9999602437019348,
+      "kl_div_avg": 0.010792155750095844,
+      "learning_rate": 4.7231096911608094e-07,
+      "loss_func": "stage2",
+      "step": 162
+    },
+    {
+      "epoch": 0.29,
+      "importance_ratio": 0.9999933242797852,
+      "kl_div_avg": 0.010607779026031494,
+      "learning_rate": 4.7213347532836353e-07,
+      "loss_func": "stage2",
+      "step": 163
+    },
+    {
+      "epoch": 0.29,
+      "importance_ratio": 0.999122142791748,
+      "kl_div_avg": 0.011827336624264717,
+      "learning_rate": 4.71955981540646e-07,
+      "loss_func": "stage2",
+      "step": 164
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 2822,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20.0,
+  "save_steps": 50,
+  "total_flos": 0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f46e6b8b37276e745292327a73921529fab297b50d103b995177abf1a213fe4
+size 11000

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,587 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# set to a truthy value to make the functions in this script print extra diagnostics
+debug = 0
+# load to cpu (passed as ``map_location`` to ``torch.load``)
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files of ``checkpoint_dir`` as listed by ``get_checkpoint_files``."""
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files of ``checkpoint_dir`` as listed by ``get_checkpoint_files``."""
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable params from the per-rank fp32 partitions and store them in ``state_dict``
+    (modified in place)
+    Args:
+        - ``state_dict``: dict being filled, name -> tensor
+        - ``world_size``: number of ranks; used for the debug dump and the alignment check
+        - ``fp32_flat_groups``: per rank, a sequence with one flat fp32 tensor per param group
+        - ``zero_model_states``: parsed model states; ``param_shapes`` of the first entry is used
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed numel of a group differs from the available numel
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    # for every param group, concatenate the partitions of all ranks into one full flat vector
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    # walk each group's shapes in order and cut the matching slice out of that group's flat vector
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        # from here on avail_numel is the numel of the current group only
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            unpartitioned_numel = shape.numel()
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        # round x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable params of a zero3 checkpoint from the per-rank flat fp32 tensors and store
+    them in ``state_dict`` (modified in place)
+    Args:
+        - ``state_dict``: dict being filled, name -> tensor
+        - ``world_size``: number of ranks
+        - ``fp32_flat_groups``: one flat fp32 tensor per rank
+        - ``zero_model_states``: parsed model states; ``param_shapes`` of the first entry is used
+    Raises:
+        - ``ValueError``: if the consumed numel over all ranks differs from the available numel
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # total numel available, taking the size of rank 0's tensor times the number of ranks
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    # offset is the read position inside each rank's flat tensor (the same for all ranks)
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        # take this param's slice from every rank, join the slices in rank order, then cut the result
+        # back to the param's real numel before viewing it with its shape
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    # scale the per-rank position to a total over all ranks so it is comparable with avail_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    """
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)