Upload folder using huggingface_hub

Browse files

Files changed (15) hide show

added_tokens.json +40 -0
config.json +31 -0
generation_config.json +4 -0
merges.txt +0 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer_config.json +326 -0
trainer_state.json +2148 -0
training_args.bin +3 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "\t\t": 50294,
+  "\t\t\t": 50293,
+  "\t\t\t\t": 50292,
+  "\t\t\t\t\t": 50291,
+  "\t\t\t\t\t\t": 50290,
+  "\t\t\t\t\t\t\t": 50289,
+  "\t\t\t\t\t\t\t\t": 50288,
+  "\t\t\t\t\t\t\t\t\t": 50287,
+  "  ": 50286,
+  "   ": 50285,
+  "    ": 50284,
+  "     ": 50283,
+  "      ": 50282,
+  "       ": 50281,
+  "        ": 50280,
+  "         ": 50279,
+  "          ": 50278,
+  "           ": 50277,
+  "            ": 50276,
+  "             ": 50275,
+  "              ": 50274,
+  "               ": 50273,
+  "                ": 50272,
+  "                 ": 50271,
+  "                  ": 50270,
+  "                   ": 50269,
+  "                    ": 50268,
+  "                     ": 50267,
+  "                      ": 50266,
+  "                       ": 50265,
+  "                        ": 50264,
+  "                         ": 50263,
+  "                          ": 50262,
+  "                           ": 50261,
+  "                            ": 50260,
+  "                             ": 50259,
+  "                              ": 50258,
+  "                               ": 50257
+}

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "_name_or_path": "kaizen9/phi-1_5_HQ_6000_200k_FP",
+  "architectures": [
+    "QPhiForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "embd_pdrop": 0.0,
+  "eos_token_id": null,
+  "gradient_checkpointing_step": 7,
+  "hidden_act": "gelu_new",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "phi",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 32,
+  "partial_rotary_factor": 0.5,
+  "qk_layernorm": false,
+  "resid_pdrop": 0.0,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "_from_model_config": true,
+  "transformers_version": "4.47.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3bc8d8eb723656398edb6c5ae97d719ce4048f2ab639dbe3427aadbf168a158b
+size 2837479464

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb408dd46d97d56b7b0b68b2a835ed2e21cd62606f5fff0cc650fb0f5c8fa4ff
+size 2051182202

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:65efacbf532f891d581bc44560cc4a191acde3008f06b38c6379dd43be56af76
+size 14512

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba868f182320f3198afac742a233ba97edc4cc4bffda1245f222b2a7526abbd6
+size 14512

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce2b68abd6c86aa925023746562610bf7f4e284153048eb68a42712f88e800e8
+size 1000

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,326 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50257": {
+      "content": "                               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "                         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50280": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50281": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50282": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50283": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50284": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50285": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50286": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50287": {
+      "content": "\t\t\t\t\t\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50288": {
+      "content": "\t\t\t\t\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50289": {
+      "content": "\t\t\t\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50290": {
+      "content": "\t\t\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50291": {
+      "content": "\t\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50292": {
+      "content": "\t\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50293": {
+      "content": "\t\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50294": {
+      "content": "\t\t",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 2048,
+  "padding_side": "right",
+  "return_token_type_ids": false,
+  "tokenizer_class": "CodeGenTokenizer",
+  "unk_token": "<|endoftext|>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2148 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.08483363182203789,
+  "eval_steps": 300,
+  "global_step": 300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0002827787727401263,
+      "grad_norm": 0.80859375,
+      "learning_rate": 2.5e-06,
+      "loss": 2.7268,
+      "step": 1
+    },
+    {
+      "epoch": 0.0005655575454802526,
+      "grad_norm": 0.7890625,
+      "learning_rate": 5e-06,
+      "loss": 2.718,
+      "step": 2
+    },
+    {
+      "epoch": 0.000848336318220379,
+      "grad_norm": 0.77734375,
+      "learning_rate": 7.5e-06,
+      "loss": 2.6908,
+      "step": 3
+    },
+    {
+      "epoch": 0.0011311150909605052,
+      "grad_norm": 0.796875,
+      "learning_rate": 1e-05,
+      "loss": 2.8856,
+      "step": 4
+    },
+    {
+      "epoch": 0.0014138938637006315,
+      "grad_norm": 0.78125,
+      "learning_rate": 1.25e-05,
+      "loss": 2.7944,
+      "step": 5
+    },
+    {
+      "epoch": 0.001696672636440758,
+      "grad_norm": 0.76171875,
+      "learning_rate": 1.5e-05,
+      "loss": 2.8203,
+      "step": 6
+    },
+    {
+      "epoch": 0.001979451409180884,
+      "grad_norm": 0.76953125,
+      "learning_rate": 1.7500000000000002e-05,
+      "loss": 2.6714,
+      "step": 7
+    },
+    {
+      "epoch": 0.0022622301819210104,
+      "grad_norm": 0.8203125,
+      "learning_rate": 2e-05,
+      "loss": 2.6746,
+      "step": 8
+    },
+    {
+      "epoch": 0.0025450089546611367,
+      "grad_norm": 0.74609375,
+      "learning_rate": 2.2499999999999998e-05,
+      "loss": 2.7972,
+      "step": 9
+    },
+    {
+      "epoch": 0.002827787727401263,
+      "grad_norm": 0.8046875,
+      "learning_rate": 2.5e-05,
+      "loss": 2.8224,
+      "step": 10
+    },
+    {
+      "epoch": 0.003110566500141389,
+      "grad_norm": 0.80078125,
+      "learning_rate": 2.75e-05,
+      "loss": 2.7473,
+      "step": 11
+    },
+    {
+      "epoch": 0.003393345272881516,
+      "grad_norm": 0.83203125,
+      "learning_rate": 3e-05,
+      "loss": 2.7134,
+      "step": 12
+    },
+    {
+      "epoch": 0.003676124045621642,
+      "grad_norm": 0.81640625,
+      "learning_rate": 3.2500000000000004e-05,
+      "loss": 2.7946,
+      "step": 13
+    },
+    {
+      "epoch": 0.003958902818361768,
+      "grad_norm": 0.73828125,
+      "learning_rate": 3.5000000000000004e-05,
+      "loss": 2.8185,
+      "step": 14
+    },
+    {
+      "epoch": 0.004241681591101895,
+      "grad_norm": 0.76171875,
+      "learning_rate": 3.75e-05,
+      "loss": 2.7015,
+      "step": 15
+    },
+    {
+      "epoch": 0.004524460363842021,
+      "grad_norm": 0.73828125,
+      "learning_rate": 4e-05,
+      "loss": 2.7398,
+      "step": 16
+    },
+    {
+      "epoch": 0.004807239136582147,
+      "grad_norm": 0.74609375,
+      "learning_rate": 4.25e-05,
+      "loss": 2.6954,
+      "step": 17
+    },
+    {
+      "epoch": 0.005090017909322273,
+      "grad_norm": 0.79296875,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 2.7737,
+      "step": 18
+    },
+    {
+      "epoch": 0.0053727966820624,
+      "grad_norm": 0.78515625,
+      "learning_rate": 4.75e-05,
+      "loss": 2.7532,
+      "step": 19
+    },
+    {
+      "epoch": 0.005655575454802526,
+      "grad_norm": 0.7734375,
+      "learning_rate": 5e-05,
+      "loss": 2.7254,
+      "step": 20
+    },
+    {
+      "epoch": 0.005938354227542652,
+      "grad_norm": 0.7890625,
+      "learning_rate": 5.25e-05,
+      "loss": 2.8117,
+      "step": 21
+    },
+    {
+      "epoch": 0.006221133000282778,
+      "grad_norm": 0.76171875,
+      "learning_rate": 5.5e-05,
+      "loss": 2.7023,
+      "step": 22
+    },
+    {
+      "epoch": 0.0065039117730229055,
+      "grad_norm": 0.796875,
+      "learning_rate": 5.75e-05,
+      "loss": 2.7226,
+      "step": 23
+    },
+    {
+      "epoch": 0.006786690545763032,
+      "grad_norm": 0.75,
+      "learning_rate": 6e-05,
+      "loss": 2.7699,
+      "step": 24
+    },
+    {
+      "epoch": 0.007069469318503158,
+      "grad_norm": 0.75390625,
+      "learning_rate": 6.25e-05,
+      "loss": 2.8024,
+      "step": 25
+    },
+    {
+      "epoch": 0.007352248091243284,
+      "grad_norm": 0.69921875,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 2.8438,
+      "step": 26
+    },
+    {
+      "epoch": 0.0076350268639834105,
+      "grad_norm": 0.69921875,
+      "learning_rate": 6.75e-05,
+      "loss": 2.7921,
+      "step": 27
+    },
+    {
+      "epoch": 0.007917805636723537,
+      "grad_norm": 0.69140625,
+      "learning_rate": 7.000000000000001e-05,
+      "loss": 2.6334,
+      "step": 28
+    },
+    {
+      "epoch": 0.008200584409463663,
+      "grad_norm": 0.7421875,
+      "learning_rate": 7.25e-05,
+      "loss": 2.7756,
+      "step": 29
+    },
+    {
+      "epoch": 0.00848336318220379,
+      "grad_norm": 0.7421875,
+      "learning_rate": 7.5e-05,
+      "loss": 2.7423,
+      "step": 30
+    },
+    {
+      "epoch": 0.008766141954943915,
+      "grad_norm": 0.65234375,
+      "learning_rate": 7.75e-05,
+      "loss": 2.8511,
+      "step": 31
+    },
+    {
+      "epoch": 0.009048920727684042,
+      "grad_norm": 0.625,
+      "learning_rate": 8e-05,
+      "loss": 2.7675,
+      "step": 32
+    },
+    {
+      "epoch": 0.009331699500424168,
+      "grad_norm": 0.65234375,
+      "learning_rate": 8.25e-05,
+      "loss": 2.7701,
+      "step": 33
+    },
+    {
+      "epoch": 0.009614478273164294,
+      "grad_norm": 0.609375,
+      "learning_rate": 8.5e-05,
+      "loss": 2.855,
+      "step": 34
+    },
+    {
+      "epoch": 0.00989725704590442,
+      "grad_norm": 0.6328125,
+      "learning_rate": 8.75e-05,
+      "loss": 2.8302,
+      "step": 35
+    },
+    {
+      "epoch": 0.010180035818644547,
+      "grad_norm": 0.59375,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 2.8412,
+      "step": 36
+    },
+    {
+      "epoch": 0.010462814591384673,
+      "grad_norm": 0.609375,
+      "learning_rate": 9.25e-05,
+      "loss": 2.8331,
+      "step": 37
+    },
+    {
+      "epoch": 0.0107455933641248,
+      "grad_norm": 0.61328125,
+      "learning_rate": 9.5e-05,
+      "loss": 2.7374,
+      "step": 38
+    },
+    {
+      "epoch": 0.011028372136864925,
+      "grad_norm": 0.57421875,
+      "learning_rate": 9.750000000000001e-05,
+      "loss": 2.7886,
+      "step": 39
+    },
+    {
+      "epoch": 0.011311150909605052,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.0001,
+      "loss": 2.7014,
+      "step": 40
+    },
+    {
+      "epoch": 0.011593929682345178,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0001025,
+      "loss": 2.7952,
+      "step": 41
+    },
+    {
+      "epoch": 0.011876708455085304,
+      "grad_norm": 0.53515625,
+      "learning_rate": 0.000105,
+      "loss": 2.7336,
+      "step": 42
+    },
+    {
+      "epoch": 0.01215948722782543,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.0001075,
+      "loss": 2.7843,
+      "step": 43
+    },
+    {
+      "epoch": 0.012442266000565557,
+      "grad_norm": 0.5625,
+      "learning_rate": 0.00011,
+      "loss": 2.6144,
+      "step": 44
+    },
+    {
+      "epoch": 0.012725044773305685,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.00011250000000000001,
+      "loss": 2.7719,
+      "step": 45
+    },
+    {
+      "epoch": 0.013007823546045811,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.000115,
+      "loss": 2.7298,
+      "step": 46
+    },
+    {
+      "epoch": 0.013290602318785937,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0001175,
+      "loss": 2.8204,
+      "step": 47
+    },
+    {
+      "epoch": 0.013573381091526063,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.00012,
+      "loss": 2.793,
+      "step": 48
+    },
+    {
+      "epoch": 0.01385615986426619,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.0001225,
+      "loss": 2.7587,
+      "step": 49
+    },
+    {
+      "epoch": 0.014138938637006316,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.000125,
+      "loss": 2.6685,
+      "step": 50
+    },
+    {
+      "epoch": 0.014421717409746442,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0001275,
+      "loss": 2.7757,
+      "step": 51
+    },
+    {
+      "epoch": 0.014704496182486568,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 2.7917,
+      "step": 52
+    },
+    {
+      "epoch": 0.014987274955226695,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.00013250000000000002,
+      "loss": 2.7146,
+      "step": 53
+    },
+    {
+      "epoch": 0.015270053727966821,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.000135,
+      "loss": 2.775,
+      "step": 54
+    },
+    {
+      "epoch": 0.015552832500706947,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0001375,
+      "loss": 2.7716,
+      "step": 55
+    },
+    {
+      "epoch": 0.015835611273447073,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 2.7673,
+      "step": 56
+    },
+    {
+      "epoch": 0.0161183900461872,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0001425,
+      "loss": 2.7578,
+      "step": 57
+    },
+    {
+      "epoch": 0.016401168818927326,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.000145,
+      "loss": 2.7354,
+      "step": 58
+    },
+    {
+      "epoch": 0.016683947591667452,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0001475,
+      "loss": 2.8292,
+      "step": 59
+    },
+    {
+      "epoch": 0.01696672636440758,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00015,
+      "loss": 2.7623,
+      "step": 60
+    },
+    {
+      "epoch": 0.017249505137147705,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001525,
+      "loss": 2.7563,
+      "step": 61
+    },
+    {
+      "epoch": 0.01753228390988783,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.000155,
+      "loss": 2.7535,
+      "step": 62
+    },
+    {
+      "epoch": 0.017815062682627957,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0001575,
+      "loss": 2.7041,
+      "step": 63
+    },
+    {
+      "epoch": 0.018097841455368083,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00016,
+      "loss": 2.7757,
+      "step": 64
+    },
+    {
+      "epoch": 0.01838062022810821,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00016250000000000002,
+      "loss": 2.8202,
+      "step": 65
+    },
+    {
+      "epoch": 0.018663399000848336,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.000165,
+      "loss": 2.7248,
+      "step": 66
+    },
+    {
+      "epoch": 0.018946177773588462,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0001675,
+      "loss": 2.7991,
+      "step": 67
+    },
+    {
+      "epoch": 0.01922895654632859,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00017,
+      "loss": 2.7269,
+      "step": 68
+    },
+    {
+      "epoch": 0.019511735319068715,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0001725,
+      "loss": 2.7502,
+      "step": 69
+    },
+    {
+      "epoch": 0.01979451409180884,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.000175,
+      "loss": 2.7022,
+      "step": 70
+    },
+    {
+      "epoch": 0.020077292864548967,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0001775,
+      "loss": 2.7533,
+      "step": 71
+    },
+    {
+      "epoch": 0.020360071637289093,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 2.8295,
+      "step": 72
+    },
+    {
+      "epoch": 0.02064285041002922,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0001825,
+      "loss": 2.8412,
+      "step": 73
+    },
+    {
+      "epoch": 0.020925629182769346,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.000185,
+      "loss": 2.8365,
+      "step": 74
+    },
+    {
+      "epoch": 0.021208407955509472,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0001875,
+      "loss": 2.7001,
+      "step": 75
+    },
+    {
+      "epoch": 0.0214911867282496,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00019,
+      "loss": 2.7443,
+      "step": 76
+    },
+    {
+      "epoch": 0.021773965500989725,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00019250000000000002,
+      "loss": 2.7344,
+      "step": 77
+    },
+    {
+      "epoch": 0.02205674427372985,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 2.7879,
+      "step": 78
+    },
+    {
+      "epoch": 0.022339523046469977,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0001975,
+      "loss": 2.7279,
+      "step": 79
+    },
+    {
+      "epoch": 0.022622301819210103,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002,
+      "loss": 2.7011,
+      "step": 80
+    },
+    {
+      "epoch": 0.02290508059195023,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00020250000000000002,
+      "loss": 2.8249,
+      "step": 81
+    },
+    {
+      "epoch": 0.023187859364690356,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.000205,
+      "loss": 2.6382,
+      "step": 82
+    },
+    {
+      "epoch": 0.023470638137430482,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002075,
+      "loss": 2.7064,
+      "step": 83
+    },
+    {
+      "epoch": 0.02375341691017061,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00021,
+      "loss": 2.6595,
+      "step": 84
+    },
+    {
+      "epoch": 0.024036195682910735,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0002125,
+      "loss": 2.7486,
+      "step": 85
+    },
+    {
+      "epoch": 0.02431897445565086,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.000215,
+      "loss": 2.7189,
+      "step": 86
+    },
+    {
+      "epoch": 0.024601753228390987,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002175,
+      "loss": 2.641,
+      "step": 87
+    },
+    {
+      "epoch": 0.024884532001131113,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00022,
+      "loss": 2.7538,
+      "step": 88
+    },
+    {
+      "epoch": 0.02516731077387124,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00022250000000000001,
+      "loss": 2.7962,
+      "step": 89
+    },
+    {
+      "epoch": 0.02545008954661137,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00022500000000000002,
+      "loss": 2.6709,
+      "step": 90
+    },
+    {
+      "epoch": 0.025732868319351496,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002275,
+      "loss": 2.7583,
+      "step": 91
+    },
+    {
+      "epoch": 0.026015647092091622,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00023,
+      "loss": 2.7533,
+      "step": 92
+    },
+    {
+      "epoch": 0.026298425864831748,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002325,
+      "loss": 2.7713,
+      "step": 93
+    },
+    {
+      "epoch": 0.026581204637571874,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.000235,
+      "loss": 2.7538,
+      "step": 94
+    },
+    {
+      "epoch": 0.026863983410312,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0002375,
+      "loss": 2.6667,
+      "step": 95
+    },
+    {
+      "epoch": 0.027146762183052127,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00024,
+      "loss": 2.629,
+      "step": 96
+    },
+    {
+      "epoch": 0.027429540955792253,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00024249999999999999,
+      "loss": 2.6954,
+      "step": 97
+    },
+    {
+      "epoch": 0.02771231972853238,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.000245,
+      "loss": 2.6587,
+      "step": 98
+    },
+    {
+      "epoch": 0.027995098501272506,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0002475,
+      "loss": 2.7655,
+      "step": 99
+    },
+    {
+      "epoch": 0.028277877274012632,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00025,
+      "loss": 2.7836,
+      "step": 100
+    },
+    {
+      "epoch": 0.028560656046752758,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002525,
+      "loss": 2.7529,
+      "step": 101
+    },
+    {
+      "epoch": 0.028843434819492884,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.000255,
+      "loss": 2.6644,
+      "step": 102
+    },
+    {
+      "epoch": 0.02912621359223301,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0002575,
+      "loss": 2.705,
+      "step": 103
+    },
+    {
+      "epoch": 0.029408992364973137,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 2.7109,
+      "step": 104
+    },
+    {
+      "epoch": 0.029691771137713263,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00026250000000000004,
+      "loss": 2.7592,
+      "step": 105
+    },
+    {
+      "epoch": 0.02997454991045339,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00026500000000000004,
+      "loss": 2.747,
+      "step": 106
+    },
+    {
+      "epoch": 0.030257328683193516,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0002675,
+      "loss": 2.7356,
+      "step": 107
+    },
+    {
+      "epoch": 0.030540107455933642,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00027,
+      "loss": 2.7167,
+      "step": 108
+    },
+    {
+      "epoch": 0.030822886228673768,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002725,
+      "loss": 2.6863,
+      "step": 109
+    },
+    {
+      "epoch": 0.031105665001413894,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000275,
+      "loss": 2.6439,
+      "step": 110
+    },
+    {
+      "epoch": 0.03138844377415402,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0002775,
+      "loss": 2.6536,
+      "step": 111
+    },
+    {
+      "epoch": 0.03167122254689415,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 2.7481,
+      "step": 112
+    },
+    {
+      "epoch": 0.03195400131963427,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0002825,
+      "loss": 2.6734,
+      "step": 113
+    },
+    {
+      "epoch": 0.0322367800923744,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.000285,
+      "loss": 2.7453,
+      "step": 114
+    },
+    {
+      "epoch": 0.032519558865114526,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002875,
+      "loss": 2.8891,
+      "step": 115
+    },
+    {
+      "epoch": 0.03280233763785465,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00029,
+      "loss": 2.7235,
+      "step": 116
+    },
+    {
+      "epoch": 0.03308511641059478,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0002925,
+      "loss": 2.6824,
+      "step": 117
+    },
+    {
+      "epoch": 0.033367895183334904,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.000295,
+      "loss": 2.7092,
+      "step": 118
+    },
+    {
+      "epoch": 0.03365067395607503,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00029749999999999997,
+      "loss": 2.7626,
+      "step": 119
+    },
+    {
+      "epoch": 0.03393345272881516,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0003,
+      "loss": 2.6733,
+      "step": 120
+    },
+    {
+      "epoch": 0.03421623150155528,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0003025,
+      "loss": 2.6709,
+      "step": 121
+    },
+    {
+      "epoch": 0.03449901027429541,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.000305,
+      "loss": 2.7311,
+      "step": 122
+    },
+    {
+      "epoch": 0.034781789047035536,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0003075,
+      "loss": 2.6115,
+      "step": 123
+    },
+    {
+      "epoch": 0.03506456781977566,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00031,
+      "loss": 2.7095,
+      "step": 124
+    },
+    {
+      "epoch": 0.03534734659251579,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0003125,
+      "loss": 2.6627,
+      "step": 125
+    },
+    {
+      "epoch": 0.035630125365255914,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.000315,
+      "loss": 2.8735,
+      "step": 126
+    },
+    {
+      "epoch": 0.03591290413799604,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0003175,
+      "loss": 2.7889,
+      "step": 127
+    },
+    {
+      "epoch": 0.03619568291073617,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00032,
+      "loss": 2.6057,
+      "step": 128
+    },
+    {
+      "epoch": 0.03647846168347629,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00032250000000000003,
+      "loss": 2.6387,
+      "step": 129
+    },
+    {
+      "epoch": 0.03676124045621642,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00032500000000000004,
+      "loss": 2.6529,
+      "step": 130
+    },
+    {
+      "epoch": 0.037044019228956546,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00032750000000000005,
+      "loss": 2.795,
+      "step": 131
+    },
+    {
+      "epoch": 0.03732679800169667,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00033,
+      "loss": 2.685,
+      "step": 132
+    },
+    {
+      "epoch": 0.0376095767744368,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0003325,
+      "loss": 2.6728,
+      "step": 133
+    },
+    {
+      "epoch": 0.037892355547176924,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.000335,
+      "loss": 2.6906,
+      "step": 134
+    },
+    {
+      "epoch": 0.03817513431991705,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0003375,
+      "loss": 2.6905,
+      "step": 135
+    },
+    {
+      "epoch": 0.03845791309265718,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00034,
+      "loss": 2.8113,
+      "step": 136
+    },
+    {
+      "epoch": 0.0387406918653973,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00034250000000000003,
+      "loss": 2.7053,
+      "step": 137
+    },
+    {
+      "epoch": 0.03902347063813743,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.000345,
+      "loss": 2.7589,
+      "step": 138
+    },
+    {
+      "epoch": 0.039306249410877556,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0003475,
+      "loss": 2.7462,
+      "step": 139
+    },
+    {
+      "epoch": 0.03958902818361768,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00035,
+      "loss": 2.801,
+      "step": 140
+    },
+    {
+      "epoch": 0.03987180695635781,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0003525,
+      "loss": 2.7696,
+      "step": 141
+    },
+    {
+      "epoch": 0.040154585729097934,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.000355,
+      "loss": 2.7535,
+      "step": 142
+    },
+    {
+      "epoch": 0.04043736450183806,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0003575,
+      "loss": 2.6457,
+      "step": 143
+    },
+    {
+      "epoch": 0.04072014327457819,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 2.6899,
+      "step": 144
+    },
+    {
+      "epoch": 0.04100292204731831,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0003625,
+      "loss": 2.7357,
+      "step": 145
+    },
+    {
+      "epoch": 0.04128570082005844,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.000365,
+      "loss": 2.7567,
+      "step": 146
+    },
+    {
+      "epoch": 0.041568479592798566,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0003675,
+      "loss": 2.7435,
+      "step": 147
+    },
+    {
+      "epoch": 0.04185125836553869,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00037,
+      "loss": 2.719,
+      "step": 148
+    },
+    {
+      "epoch": 0.04213403713827882,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0003725,
+      "loss": 2.7422,
+      "step": 149
+    },
+    {
+      "epoch": 0.042416815911018944,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.000375,
+      "loss": 2.6422,
+      "step": 150
+    },
+    {
+      "epoch": 0.04269959468375907,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0003775,
+      "loss": 2.6201,
+      "step": 151
+    },
+    {
+      "epoch": 0.0429823734564992,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00038,
+      "loss": 2.7534,
+      "step": 152
+    },
+    {
+      "epoch": 0.04326515222923932,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.00038250000000000003,
+      "loss": 2.7279,
+      "step": 153
+    },
+    {
+      "epoch": 0.04354793100197945,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00038500000000000003,
+      "loss": 2.6537,
+      "step": 154
+    },
+    {
+      "epoch": 0.043830709774719576,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00038750000000000004,
+      "loss": 2.657,
+      "step": 155
+    },
+    {
+      "epoch": 0.0441134885474597,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 2.6733,
+      "step": 156
+    },
+    {
+      "epoch": 0.04439626732019983,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0003925,
+      "loss": 2.6964,
+      "step": 157
+    },
+    {
+      "epoch": 0.044679046092939954,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.000395,
+      "loss": 2.7566,
+      "step": 158
+    },
+    {
+      "epoch": 0.04496182486568008,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0003975,
+      "loss": 2.729,
+      "step": 159
+    },
+    {
+      "epoch": 0.04524460363842021,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0004,
+      "loss": 2.6093,
+      "step": 160
+    },
+    {
+      "epoch": 0.04552738241116033,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0004025,
+      "loss": 2.7484,
+      "step": 161
+    },
+    {
+      "epoch": 0.04581016118390046,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 2.7072,
+      "step": 162
+    },
+    {
+      "epoch": 0.046092939956640586,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0004075,
+      "loss": 2.7066,
+      "step": 163
+    },
+    {
+      "epoch": 0.04637571872938071,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00041,
+      "loss": 2.6637,
+      "step": 164
+    },
+    {
+      "epoch": 0.04665849750212084,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0004125,
+      "loss": 2.7463,
+      "step": 165
+    },
+    {
+      "epoch": 0.046941276274860964,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.000415,
+      "loss": 2.6977,
+      "step": 166
+    },
+    {
+      "epoch": 0.04722405504760109,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0004175,
+      "loss": 2.6522,
+      "step": 167
+    },
+    {
+      "epoch": 0.04750683382034122,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00042,
+      "loss": 2.728,
+      "step": 168
+    },
+    {
+      "epoch": 0.04778961259308134,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00042249999999999997,
+      "loss": 2.8141,
+      "step": 169
+    },
+    {
+      "epoch": 0.04807239136582147,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.000425,
+      "loss": 2.7782,
+      "step": 170
+    },
+    {
+      "epoch": 0.048355170138561596,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0004275,
+      "loss": 2.6818,
+      "step": 171
+    },
+    {
+      "epoch": 0.04863794891130172,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00043,
+      "loss": 2.7362,
+      "step": 172
+    },
+    {
+      "epoch": 0.04892072768404185,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0004325,
+      "loss": 2.7075,
+      "step": 173
+    },
+    {
+      "epoch": 0.049203506456781974,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.000435,
+      "loss": 2.8147,
+      "step": 174
+    },
+    {
+      "epoch": 0.0494862852295221,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0004375,
+      "loss": 2.5834,
+      "step": 175
+    },
+    {
+      "epoch": 0.04976906400226223,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00044,
+      "loss": 2.5964,
+      "step": 176
+    },
+    {
+      "epoch": 0.05005184277500235,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0004425,
+      "loss": 2.7753,
+      "step": 177
+    },
+    {
+      "epoch": 0.05033462154774248,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 2.5962,
+      "step": 178
+    },
+    {
+      "epoch": 0.05061740032048261,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.00044750000000000004,
+      "loss": 2.7144,
+      "step": 179
+    },
+    {
+      "epoch": 0.05090017909322274,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 2.6035,
+      "step": 180
+    },
+    {
+      "epoch": 0.051182957865962865,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00045250000000000005,
+      "loss": 2.7479,
+      "step": 181
+    },
+    {
+      "epoch": 0.05146573663870299,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.000455,
+      "loss": 2.7137,
+      "step": 182
+    },
+    {
+      "epoch": 0.05174851541144312,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0004575,
+      "loss": 2.8017,
+      "step": 183
+    },
+    {
+      "epoch": 0.052031294184183244,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00046,
+      "loss": 2.7057,
+      "step": 184
+    },
+    {
+      "epoch": 0.05231407295692337,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0004625,
+      "loss": 2.6357,
+      "step": 185
+    },
+    {
+      "epoch": 0.052596851729663496,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.000465,
+      "loss": 2.7985,
+      "step": 186
+    },
+    {
+      "epoch": 0.05287963050240362,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00046750000000000003,
+      "loss": 2.7495,
+      "step": 187
+    },
+    {
+      "epoch": 0.05316240927514375,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00047,
+      "loss": 2.8081,
+      "step": 188
+    },
+    {
+      "epoch": 0.053445188047883875,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0004725,
+      "loss": 2.6571,
+      "step": 189
+    },
+    {
+      "epoch": 0.053727966820624,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.000475,
+      "loss": 2.6884,
+      "step": 190
+    },
+    {
+      "epoch": 0.05401074559336413,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0004775,
+      "loss": 2.5996,
+      "step": 191
+    },
+    {
+      "epoch": 0.054293524366104254,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00048,
+      "loss": 2.7077,
+      "step": 192
+    },
+    {
+      "epoch": 0.05457630313884438,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0004825,
+      "loss": 2.5899,
+      "step": 193
+    },
+    {
+      "epoch": 0.054859081911584506,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 2.6607,
+      "step": 194
+    },
+    {
+      "epoch": 0.05514186068432463,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0004875,
+      "loss": 2.6953,
+      "step": 195
+    },
+    {
+      "epoch": 0.05542463945706476,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.00049,
+      "loss": 2.7074,
+      "step": 196
+    },
+    {
+      "epoch": 0.055707418229804885,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0004925,
+      "loss": 2.7335,
+      "step": 197
+    },
+    {
+      "epoch": 0.05599019700254501,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.000495,
+      "loss": 2.6951,
+      "step": 198
+    },
+    {
+      "epoch": 0.05627297577528514,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0004975,
+      "loss": 2.6671,
+      "step": 199
+    },
+    {
+      "epoch": 0.056555754548025264,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0005,
+      "loss": 2.6488,
+      "step": 200
+    },
+    {
+      "epoch": 0.05683853332076539,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0005024999999999999,
+      "loss": 2.6969,
+      "step": 201
+    },
+    {
+      "epoch": 0.057121312093505516,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.000505,
+      "loss": 2.7266,
+      "step": 202
+    },
+    {
+      "epoch": 0.05740409086624564,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0005074999999999999,
+      "loss": 2.7185,
+      "step": 203
+    },
+    {
+      "epoch": 0.05768686963898577,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00051,
+      "loss": 2.6981,
+      "step": 204
+    },
+    {
+      "epoch": 0.057969648411725895,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0005124999999999999,
+      "loss": 2.7895,
+      "step": 205
+    },
+    {
+      "epoch": 0.05825242718446602,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.000515,
+      "loss": 2.6501,
+      "step": 206
+    },
+    {
+      "epoch": 0.05853520595720615,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0005175,
+      "loss": 2.7063,
+      "step": 207
+    },
+    {
+      "epoch": 0.058817984729946274,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0005200000000000001,
+      "loss": 2.6494,
+      "step": 208
+    },
+    {
+      "epoch": 0.0591007635026864,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0005225,
+      "loss": 2.7103,
+      "step": 209
+    },
+    {
+      "epoch": 0.059383542275426526,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0005250000000000001,
+      "loss": 2.737,
+      "step": 210
+    },
+    {
+      "epoch": 0.05966632104816665,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0005275,
+      "loss": 2.767,
+      "step": 211
+    },
+    {
+      "epoch": 0.05994909982090678,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0005300000000000001,
+      "loss": 2.7649,
+      "step": 212
+    },
+    {
+      "epoch": 0.060231878593646905,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0005325,
+      "loss": 2.7723,
+      "step": 213
+    },
+    {
+      "epoch": 0.06051465736638703,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.000535,
+      "loss": 2.7049,
+      "step": 214
+    },
+    {
+      "epoch": 0.06079743613912716,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0005375,
+      "loss": 2.718,
+      "step": 215
+    },
+    {
+      "epoch": 0.061080214911867284,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00054,
+      "loss": 2.7322,
+      "step": 216
+    },
+    {
+      "epoch": 0.06136299368460741,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0005425,
+      "loss": 2.8249,
+      "step": 217
+    },
+    {
+      "epoch": 0.061645772457347536,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.000545,
+      "loss": 2.597,
+      "step": 218
+    },
+    {
+      "epoch": 0.06192855123008766,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0005475,
+      "loss": 2.6562,
+      "step": 219
+    },
+    {
+      "epoch": 0.06221133000282779,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00055,
+      "loss": 2.6592,
+      "step": 220
+    },
+    {
+      "epoch": 0.062494108775567915,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0005525,
+      "loss": 2.6218,
+      "step": 221
+    },
+    {
+      "epoch": 0.06277688754830804,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.000555,
+      "loss": 2.7255,
+      "step": 222
+    },
+    {
+      "epoch": 0.06305966632104816,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0005575,
+      "loss": 2.8204,
+      "step": 223
+    },
+    {
+      "epoch": 0.0633424450937883,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0005600000000000001,
+      "loss": 2.7503,
+      "step": 224
+    },
+    {
+      "epoch": 0.06362522386652841,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0005625000000000001,
+      "loss": 2.659,
+      "step": 225
+    },
+    {
+      "epoch": 0.06390800263926855,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.000565,
+      "loss": 2.8152,
+      "step": 226
+    },
+    {
+      "epoch": 0.06419078141200867,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0005675,
+      "loss": 2.7683,
+      "step": 227
+    },
+    {
+      "epoch": 0.0644735601847488,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00057,
+      "loss": 2.6908,
+      "step": 228
+    },
+    {
+      "epoch": 0.06475633895748892,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0005725,
+      "loss": 2.7206,
+      "step": 229
+    },
+    {
+      "epoch": 0.06503911773022905,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.000575,
+      "loss": 2.6713,
+      "step": 230
+    },
+    {
+      "epoch": 0.06532189650296917,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0005775,
+      "loss": 2.5687,
+      "step": 231
+    },
+    {
+      "epoch": 0.0656046752757093,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.00058,
+      "loss": 2.7068,
+      "step": 232
+    },
+    {
+      "epoch": 0.06588745404844942,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0005825,
+      "loss": 2.7664,
+      "step": 233
+    },
+    {
+      "epoch": 0.06617023282118956,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.000585,
+      "loss": 2.7705,
+      "step": 234
+    },
+    {
+      "epoch": 0.06645301159392968,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0005875,
+      "loss": 2.6366,
+      "step": 235
+    },
+    {
+      "epoch": 0.06673579036666981,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00059,
+      "loss": 2.6913,
+      "step": 236
+    },
+    {
+      "epoch": 0.06701856913940993,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0005925,
+      "loss": 2.6292,
+      "step": 237
+    },
+    {
+      "epoch": 0.06730134791215006,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0005949999999999999,
+      "loss": 2.6452,
+      "step": 238
+    },
+    {
+      "epoch": 0.0675841266848902,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0005975,
+      "loss": 2.5264,
+      "step": 239
+    },
+    {
+      "epoch": 0.06786690545763031,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0006,
+      "loss": 2.7456,
+      "step": 240
+    },
+    {
+      "epoch": 0.06814968423037045,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0006025000000000001,
+      "loss": 2.6951,
+      "step": 241
+    },
+    {
+      "epoch": 0.06843246300311057,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.000605,
+      "loss": 2.7262,
+      "step": 242
+    },
+    {
+      "epoch": 0.0687152417758507,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0006075000000000001,
+      "loss": 2.7282,
+      "step": 243
+    },
+    {
+      "epoch": 0.06899802054859082,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00061,
+      "loss": 2.7134,
+      "step": 244
+    },
+    {
+      "epoch": 0.06928079932133095,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0006125000000000001,
+      "loss": 2.6794,
+      "step": 245
+    },
+    {
+      "epoch": 0.06956357809407107,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.000615,
+      "loss": 2.7105,
+      "step": 246
+    },
+    {
+      "epoch": 0.0698463568668112,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0006175000000000001,
+      "loss": 2.6656,
+      "step": 247
+    },
+    {
+      "epoch": 0.07012913563955132,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00062,
+      "loss": 2.669,
+      "step": 248
+    },
+    {
+      "epoch": 0.07041191441229146,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0006225000000000001,
+      "loss": 2.7143,
+      "step": 249
+    },
+    {
+      "epoch": 0.07069469318503158,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.000625,
+      "loss": 2.7088,
+      "step": 250
+    },
+    {
+      "epoch": 0.07097747195777171,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0006274999999999999,
+      "loss": 2.6446,
+      "step": 251
+    },
+    {
+      "epoch": 0.07126025073051183,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.00063,
+      "loss": 2.6527,
+      "step": 252
+    },
+    {
+      "epoch": 0.07154302950325196,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0006324999999999999,
+      "loss": 2.7882,
+      "step": 253
+    },
+    {
+      "epoch": 0.07182580827599208,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.000635,
+      "loss": 2.711,
+      "step": 254
+    },
+    {
+      "epoch": 0.07210858704873221,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0006374999999999999,
+      "loss": 2.733,
+      "step": 255
+    },
+    {
+      "epoch": 0.07239136582147233,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00064,
+      "loss": 2.5823,
+      "step": 256
+    },
+    {
+      "epoch": 0.07267414459421247,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0006425,
+      "loss": 2.6286,
+      "step": 257
+    },
+    {
+      "epoch": 0.07295692336695259,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0006450000000000001,
+      "loss": 2.6986,
+      "step": 258
+    },
+    {
+      "epoch": 0.07323970213969272,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0006475,
+      "loss": 2.7071,
+      "step": 259
+    },
+    {
+      "epoch": 0.07352248091243284,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0006500000000000001,
+      "loss": 2.6763,
+      "step": 260
+    },
+    {
+      "epoch": 0.07380525968517297,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0006525,
+      "loss": 2.759,
+      "step": 261
+    },
+    {
+      "epoch": 0.07408803845791309,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0006550000000000001,
+      "loss": 2.735,
+      "step": 262
+    },
+    {
+      "epoch": 0.07437081723065322,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0006575,
+      "loss": 2.7129,
+      "step": 263
+    },
+    {
+      "epoch": 0.07465359600339334,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00066,
+      "loss": 2.7521,
+      "step": 264
+    },
+    {
+      "epoch": 0.07493637477613348,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0006625,
+      "loss": 2.6918,
+      "step": 265
+    },
+    {
+      "epoch": 0.0752191535488736,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.000665,
+      "loss": 2.6264,
+      "step": 266
+    },
+    {
+      "epoch": 0.07550193232161373,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0006675,
+      "loss": 2.7115,
+      "step": 267
+    },
+    {
+      "epoch": 0.07578471109435385,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00067,
+      "loss": 2.7494,
+      "step": 268
+    },
+    {
+      "epoch": 0.07606748986709398,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0006725,
+      "loss": 2.6134,
+      "step": 269
+    },
+    {
+      "epoch": 0.0763502686398341,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.000675,
+      "loss": 2.6015,
+      "step": 270
+    },
+    {
+      "epoch": 0.07663304741257423,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0006775,
+      "loss": 2.6864,
+      "step": 271
+    },
+    {
+      "epoch": 0.07691582618531435,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00068,
+      "loss": 2.6564,
+      "step": 272
+    },
+    {
+      "epoch": 0.07719860495805449,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0006825000000000001,
+      "loss": 2.6775,
+      "step": 273
+    },
+    {
+      "epoch": 0.0774813837307946,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0006850000000000001,
+      "loss": 2.6788,
+      "step": 274
+    },
+    {
+      "epoch": 0.07776416250353474,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0006875,
+      "loss": 2.6712,
+      "step": 275
+    },
+    {
+      "epoch": 0.07804694127627486,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00069,
+      "loss": 2.7014,
+      "step": 276
+    },
+    {
+      "epoch": 0.07832972004901499,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0006925,
+      "loss": 2.6744,
+      "step": 277
+    },
+    {
+      "epoch": 0.07861249882175511,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.000695,
+      "loss": 2.7591,
+      "step": 278
+    },
+    {
+      "epoch": 0.07889527759449524,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0006975,
+      "loss": 2.7315,
+      "step": 279
+    },
+    {
+      "epoch": 0.07917805636723536,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0007,
+      "loss": 2.6955,
+      "step": 280
+    },
+    {
+      "epoch": 0.0794608351399755,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0007025,
+      "loss": 2.6846,
+      "step": 281
+    },
+    {
+      "epoch": 0.07974361391271562,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.000705,
+      "loss": 2.5973,
+      "step": 282
+    },
+    {
+      "epoch": 0.08002639268545575,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0007075,
+      "loss": 2.671,
+      "step": 283
+    },
+    {
+      "epoch": 0.08030917145819587,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.00071,
+      "loss": 2.7698,
+      "step": 284
+    },
+    {
+      "epoch": 0.080591950230936,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0007125,
+      "loss": 2.6023,
+      "step": 285
+    },
+    {
+      "epoch": 0.08087472900367612,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.000715,
+      "loss": 2.5578,
+      "step": 286
+    },
+    {
+      "epoch": 0.08115750777641625,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0007175,
+      "loss": 2.7082,
+      "step": 287
+    },
+    {
+      "epoch": 0.08144028654915637,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0007199999999999999,
+      "loss": 2.6585,
+      "step": 288
+    },
+    {
+      "epoch": 0.08172306532189651,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0007225,
+      "loss": 2.6401,
+      "step": 289
+    },
+    {
+      "epoch": 0.08200584409463663,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.000725,
+      "loss": 2.6788,
+      "step": 290
+    },
+    {
+      "epoch": 0.08228862286737676,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0007275000000000001,
+      "loss": 2.681,
+      "step": 291
+    },
+    {
+      "epoch": 0.08257140164011688,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.00073,
+      "loss": 2.7276,
+      "step": 292
+    },
+    {
+      "epoch": 0.08285418041285701,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0007325000000000001,
+      "loss": 2.6946,
+      "step": 293
+    },
+    {
+      "epoch": 0.08313695918559713,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.000735,
+      "loss": 2.6882,
+      "step": 294
+    },
+    {
+      "epoch": 0.08341973795833726,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0007375000000000001,
+      "loss": 2.6571,
+      "step": 295
+    },
+    {
+      "epoch": 0.08370251673107738,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00074,
+      "loss": 2.7716,
+      "step": 296
+    },
+    {
+      "epoch": 0.08398529550381752,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0007425000000000001,
+      "loss": 2.7189,
+      "step": 297
+    },
+    {
+      "epoch": 0.08426807427655764,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.000745,
+      "loss": 2.6868,
+      "step": 298
+    },
+    {
+      "epoch": 0.08455085304929777,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0007475000000000001,
+      "loss": 2.6705,
+      "step": 299
+    },
+    {
+      "epoch": 0.08483363182203789,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00075,
+      "loss": 2.7242,
+      "step": 300
+    },
+    {
+      "epoch": 0.08483363182203789,
+      "eval_loss": 1.8609898090362549,
+      "eval_runtime": 74.934,
+      "eval_samples_per_second": 13.665,
+      "eval_steps_per_second": 0.427,
+      "step": 300
+    },
+    {
+      "epoch": 0.08483363182203789,
+      "eval/hellaswag_acc": 0.37572196773551086,
+      "eval/hellaswag_acc_norm": 0.4715196176060546,
+      "eval_hellaswag_elapsed_time": 331.94735646247864,
+      "step": 300
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 3536,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 300,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0466845500899328e+18,
+  "train_batch_size": 36,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8013b814259602ed6ff882614208872ab7a8454b694cebafcd4203b1c1554ba8
+size 7992

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff