Upload task output 1280f28a-02cf-40d7-a301-7678a6e5aafd

Browse files

Files changed (13) hide show

.gitattributes +1 -0
added_tokens.json +6 -0
config.json +30 -0
generation_config.json +8 -0
loss.txt +1 -0
merges.txt +0 -0
model.safetensors +3 -0
special_tokens_map.json +20 -0
tokenizer.json +3 -0
tokenizer_config.json +53 -0
trainer_state.json +2727 -0
training_args.bin +3 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "<|PAD_TOKEN|>": 151646,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151646,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "unsloth_version": "2024.9",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_length": 131072,
+  "max_new_tokens": 2048,
+  "pad_token_id": 151646,
+  "transformers_version": "4.51.3"
+}

loss.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 1896,0.28488120436668396

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9596c70f607b3effd76ae3260f45213a2126e8481a9e5635b16179a053ae7009
+size 3087467144

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|PAD_TOKEN|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9479047c22047670077878de944c696032b1a81049beeb6e99fbda7fb93e395a
+size 11418456

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,53 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|PAD_TOKEN|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|PAD_TOKEN|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2727 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9978925184404637,
+  "eval_steps": 500,
+  "global_step": 1896,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005268703898840885,
+      "grad_norm": 1.8046875,
+      "learning_rate": 9.142857142857142e-06,
+      "loss": 0.5469,
+      "step": 5
+    },
+    {
+      "epoch": 0.01053740779768177,
+      "grad_norm": 1.3203125,
+      "learning_rate": 2.0571428571428566e-05,
+      "loss": 0.493,
+      "step": 10
+    },
+    {
+      "epoch": 0.015806111696522657,
+      "grad_norm": 0.73828125,
+      "learning_rate": 3.2e-05,
+      "loss": 0.4305,
+      "step": 15
+    },
+    {
+      "epoch": 0.02107481559536354,
+      "grad_norm": 0.65625,
+      "learning_rate": 4.3428571428571424e-05,
+      "loss": 0.4097,
+      "step": 20
+    },
+    {
+      "epoch": 0.026343519494204427,
+      "grad_norm": 0.58984375,
+      "learning_rate": 5.485714285714285e-05,
+      "loss": 0.3943,
+      "step": 25
+    },
+    {
+      "epoch": 0.03161222339304531,
+      "grad_norm": 0.60546875,
+      "learning_rate": 6.628571428571428e-05,
+      "loss": 0.4013,
+      "step": 30
+    },
+    {
+      "epoch": 0.0368809272918862,
+      "grad_norm": 0.640625,
+      "learning_rate": 7.771428571428571e-05,
+      "loss": 0.387,
+      "step": 35
+    },
+    {
+      "epoch": 0.04214963119072708,
+      "grad_norm": 0.6171875,
+      "learning_rate": 7.999970044289027e-05,
+      "loss": 0.3758,
+      "step": 40
+    },
+    {
+      "epoch": 0.04741833508956796,
+      "grad_norm": 0.609375,
+      "learning_rate": 7.999848350238486e-05,
+      "loss": 0.3781,
+      "step": 45
+    },
+    {
+      "epoch": 0.05268703898840885,
+      "grad_norm": 0.5546875,
+      "learning_rate": 7.999633049410842e-05,
+      "loss": 0.3766,
+      "step": 50
+    },
+    {
+      "epoch": 0.05795574288724974,
+      "grad_norm": 0.578125,
+      "learning_rate": 7.999324148524308e-05,
+      "loss": 0.3749,
+      "step": 55
+    },
+    {
+      "epoch": 0.06322444678609063,
+      "grad_norm": 0.5625,
+      "learning_rate": 7.998921657217774e-05,
+      "loss": 0.3719,
+      "step": 60
+    },
+    {
+      "epoch": 0.0684931506849315,
+      "grad_norm": 0.57421875,
+      "learning_rate": 7.998425588050514e-05,
+      "loss": 0.3694,
+      "step": 65
+    },
+    {
+      "epoch": 0.0737618545837724,
+      "grad_norm": 0.57421875,
+      "learning_rate": 7.99783595650179e-05,
+      "loss": 0.3679,
+      "step": 70
+    },
+    {
+      "epoch": 0.07903055848261328,
+      "grad_norm": 0.546875,
+      "learning_rate": 7.997152780970364e-05,
+      "loss": 0.3641,
+      "step": 75
+    },
+    {
+      "epoch": 0.08429926238145416,
+      "grad_norm": 0.625,
+      "learning_rate": 7.99637608277394e-05,
+      "loss": 0.3585,
+      "step": 80
+    },
+    {
+      "epoch": 0.08956796628029505,
+      "grad_norm": 0.55859375,
+      "learning_rate": 7.99550588614848e-05,
+      "loss": 0.3562,
+      "step": 85
+    },
+    {
+      "epoch": 0.09483667017913593,
+      "grad_norm": 0.50390625,
+      "learning_rate": 7.994542218247453e-05,
+      "loss": 0.3496,
+      "step": 90
+    },
+    {
+      "epoch": 0.10010537407797682,
+      "grad_norm": 0.56640625,
+      "learning_rate": 7.993485109140998e-05,
+      "loss": 0.3585,
+      "step": 95
+    },
+    {
+      "epoch": 0.1053740779768177,
+      "grad_norm": 0.55078125,
+      "learning_rate": 7.992334591814973e-05,
+      "loss": 0.3568,
+      "step": 100
+    },
+    {
+      "epoch": 0.11064278187565858,
+      "grad_norm": 0.53515625,
+      "learning_rate": 7.991090702169934e-05,
+      "loss": 0.349,
+      "step": 105
+    },
+    {
+      "epoch": 0.11591148577449947,
+      "grad_norm": 0.5234375,
+      "learning_rate": 7.989753479020009e-05,
+      "loss": 0.345,
+      "step": 110
+    },
+    {
+      "epoch": 0.12118018967334036,
+      "grad_norm": 0.5234375,
+      "learning_rate": 7.98832296409169e-05,
+      "loss": 0.3518,
+      "step": 115
+    },
+    {
+      "epoch": 0.12644889357218125,
+      "grad_norm": 0.5703125,
+      "learning_rate": 7.986799202022531e-05,
+      "loss": 0.357,
+      "step": 120
+    },
+    {
+      "epoch": 0.13171759747102213,
+      "grad_norm": 0.6328125,
+      "learning_rate": 7.985182240359757e-05,
+      "loss": 0.3563,
+      "step": 125
+    },
+    {
+      "epoch": 0.136986301369863,
+      "grad_norm": 0.515625,
+      "learning_rate": 7.983472129558769e-05,
+      "loss": 0.3465,
+      "step": 130
+    },
+    {
+      "epoch": 0.1422550052687039,
+      "grad_norm": 0.53515625,
+      "learning_rate": 7.98166892298159e-05,
+      "loss": 0.3397,
+      "step": 135
+    },
+    {
+      "epoch": 0.1475237091675448,
+      "grad_norm": 0.5390625,
+      "learning_rate": 7.979772676895182e-05,
+      "loss": 0.3541,
+      "step": 140
+    },
+    {
+      "epoch": 0.15279241306638566,
+      "grad_norm": 0.55859375,
+      "learning_rate": 7.977783450469697e-05,
+      "loss": 0.3431,
+      "step": 145
+    },
+    {
+      "epoch": 0.15806111696522657,
+      "grad_norm": 0.51171875,
+      "learning_rate": 7.97570130577663e-05,
+      "loss": 0.3507,
+      "step": 150
+    },
+    {
+      "epoch": 0.16332982086406744,
+      "grad_norm": 0.53515625,
+      "learning_rate": 7.973526307786885e-05,
+      "loss": 0.3416,
+      "step": 155
+    },
+    {
+      "epoch": 0.16859852476290832,
+      "grad_norm": 0.48828125,
+      "learning_rate": 7.971258524368743e-05,
+      "loss": 0.3459,
+      "step": 160
+    },
+    {
+      "epoch": 0.17386722866174922,
+      "grad_norm": 0.54296875,
+      "learning_rate": 7.968898026285744e-05,
+      "loss": 0.332,
+      "step": 165
+    },
+    {
+      "epoch": 0.1791359325605901,
+      "grad_norm": 0.52734375,
+      "learning_rate": 7.966444887194489e-05,
+      "loss": 0.3496,
+      "step": 170
+    },
+    {
+      "epoch": 0.18440463645943098,
+      "grad_norm": 0.515625,
+      "learning_rate": 7.963899183642324e-05,
+      "loss": 0.3409,
+      "step": 175
+    },
+    {
+      "epoch": 0.18967334035827185,
+      "grad_norm": 0.58984375,
+      "learning_rate": 7.961260995064969e-05,
+      "loss": 0.3473,
+      "step": 180
+    },
+    {
+      "epoch": 0.19494204425711276,
+      "grad_norm": 0.57421875,
+      "learning_rate": 7.958530403784029e-05,
+      "loss": 0.3389,
+      "step": 185
+    },
+    {
+      "epoch": 0.20021074815595363,
+      "grad_norm": 0.51171875,
+      "learning_rate": 7.955707495004427e-05,
+      "loss": 0.3317,
+      "step": 190
+    },
+    {
+      "epoch": 0.2054794520547945,
+      "grad_norm": 0.52734375,
+      "learning_rate": 7.952792356811745e-05,
+      "loss": 0.3424,
+      "step": 195
+    },
+    {
+      "epoch": 0.2107481559536354,
+      "grad_norm": 0.5234375,
+      "learning_rate": 7.949785080169479e-05,
+      "loss": 0.3423,
+      "step": 200
+    },
+    {
+      "epoch": 0.2160168598524763,
+      "grad_norm": 0.490234375,
+      "learning_rate": 7.946685758916198e-05,
+      "loss": 0.338,
+      "step": 205
+    },
+    {
+      "epoch": 0.22128556375131717,
+      "grad_norm": 0.5234375,
+      "learning_rate": 7.943494489762617e-05,
+      "loss": 0.3256,
+      "step": 210
+    },
+    {
+      "epoch": 0.22655426765015807,
+      "grad_norm": 0.515625,
+      "learning_rate": 7.940211372288572e-05,
+      "loss": 0.3237,
+      "step": 215
+    },
+    {
+      "epoch": 0.23182297154899895,
+      "grad_norm": 0.54296875,
+      "learning_rate": 7.936836508939928e-05,
+      "loss": 0.3334,
+      "step": 220
+    },
+    {
+      "epoch": 0.23709167544783982,
+      "grad_norm": 0.53125,
+      "learning_rate": 7.933370005025367e-05,
+      "loss": 0.3455,
+      "step": 225
+    },
+    {
+      "epoch": 0.24236037934668073,
+      "grad_norm": 0.53125,
+      "learning_rate": 7.92981196871311e-05,
+      "loss": 0.3303,
+      "step": 230
+    },
+    {
+      "epoch": 0.2476290832455216,
+      "grad_norm": 0.486328125,
+      "learning_rate": 7.926162511027539e-05,
+      "loss": 0.3389,
+      "step": 235
+    },
+    {
+      "epoch": 0.2528977871443625,
+      "grad_norm": 0.5,
+      "learning_rate": 7.922421745845734e-05,
+      "loss": 0.3347,
+      "step": 240
+    },
+    {
+      "epoch": 0.2581664910432034,
+      "grad_norm": 0.494140625,
+      "learning_rate": 7.918589789893922e-05,
+      "loss": 0.3215,
+      "step": 245
+    },
+    {
+      "epoch": 0.26343519494204426,
+      "grad_norm": 0.50390625,
+      "learning_rate": 7.914666762743831e-05,
+      "loss": 0.3234,
+      "step": 250
+    },
+    {
+      "epoch": 0.26870389884088514,
+      "grad_norm": 0.482421875,
+      "learning_rate": 7.910652786808953e-05,
+      "loss": 0.3211,
+      "step": 255
+    },
+    {
+      "epoch": 0.273972602739726,
+      "grad_norm": 0.4765625,
+      "learning_rate": 7.90654798734074e-05,
+      "loss": 0.3332,
+      "step": 260
+    },
+    {
+      "epoch": 0.2792413066385669,
+      "grad_norm": 0.486328125,
+      "learning_rate": 7.902352492424682e-05,
+      "loss": 0.3352,
+      "step": 265
+    },
+    {
+      "epoch": 0.2845100105374078,
+      "grad_norm": 0.484375,
+      "learning_rate": 7.898066432976318e-05,
+      "loss": 0.3403,
+      "step": 270
+    },
+    {
+      "epoch": 0.2897787144362487,
+      "grad_norm": 0.49609375,
+      "learning_rate": 7.893689942737141e-05,
+      "loss": 0.3351,
+      "step": 275
+    },
+    {
+      "epoch": 0.2950474183350896,
+      "grad_norm": 0.53515625,
+      "learning_rate": 7.88922315827044e-05,
+      "loss": 0.3254,
+      "step": 280
+    },
+    {
+      "epoch": 0.30031612223393045,
+      "grad_norm": 0.490234375,
+      "learning_rate": 7.884666218957029e-05,
+      "loss": 0.336,
+      "step": 285
+    },
+    {
+      "epoch": 0.3055848261327713,
+      "grad_norm": 0.47265625,
+      "learning_rate": 7.880019266990891e-05,
+      "loss": 0.318,
+      "step": 290
+    },
+    {
+      "epoch": 0.3108535300316122,
+      "grad_norm": 0.515625,
+      "learning_rate": 7.875282447374757e-05,
+      "loss": 0.3328,
+      "step": 295
+    },
+    {
+      "epoch": 0.31612223393045313,
+      "grad_norm": 0.5,
+      "learning_rate": 7.870455907915573e-05,
+      "loss": 0.3268,
+      "step": 300
+    },
+    {
+      "epoch": 0.321390937829294,
+      "grad_norm": 0.50390625,
+      "learning_rate": 7.865539799219885e-05,
+      "loss": 0.3188,
+      "step": 305
+    },
+    {
+      "epoch": 0.3266596417281349,
+      "grad_norm": 0.51171875,
+      "learning_rate": 7.860534274689147e-05,
+      "loss": 0.3286,
+      "step": 310
+    },
+    {
+      "epoch": 0.33192834562697576,
+      "grad_norm": 0.4609375,
+      "learning_rate": 7.855439490514922e-05,
+      "loss": 0.3259,
+      "step": 315
+    },
+    {
+      "epoch": 0.33719704952581664,
+      "grad_norm": 0.5,
+      "learning_rate": 7.850255605674026e-05,
+      "loss": 0.3213,
+      "step": 320
+    },
+    {
+      "epoch": 0.3424657534246575,
+      "grad_norm": 0.494140625,
+      "learning_rate": 7.844982781923554e-05,
+      "loss": 0.3194,
+      "step": 325
+    },
+    {
+      "epoch": 0.34773445732349845,
+      "grad_norm": 0.5390625,
+      "learning_rate": 7.839621183795833e-05,
+      "loss": 0.3258,
+      "step": 330
+    },
+    {
+      "epoch": 0.3530031612223393,
+      "grad_norm": 0.482421875,
+      "learning_rate": 7.834170978593296e-05,
+      "loss": 0.3232,
+      "step": 335
+    },
+    {
+      "epoch": 0.3582718651211802,
+      "grad_norm": 0.51953125,
+      "learning_rate": 7.828632336383253e-05,
+      "loss": 0.3232,
+      "step": 340
+    },
+    {
+      "epoch": 0.3635405690200211,
+      "grad_norm": 0.4921875,
+      "learning_rate": 7.823005429992587e-05,
+      "loss": 0.3198,
+      "step": 345
+    },
+    {
+      "epoch": 0.36880927291886195,
+      "grad_norm": 0.4375,
+      "learning_rate": 7.81729043500237e-05,
+      "loss": 0.3222,
+      "step": 350
+    },
+    {
+      "epoch": 0.37407797681770283,
+      "grad_norm": 0.478515625,
+      "learning_rate": 7.811487529742366e-05,
+      "loss": 0.3269,
+      "step": 355
+    },
+    {
+      "epoch": 0.3793466807165437,
+      "grad_norm": 0.4765625,
+      "learning_rate": 7.805596895285485e-05,
+      "loss": 0.3221,
+      "step": 360
+    },
+    {
+      "epoch": 0.38461538461538464,
+      "grad_norm": 0.4921875,
+      "learning_rate": 7.799618715442116e-05,
+      "loss": 0.3178,
+      "step": 365
+    },
+    {
+      "epoch": 0.3898840885142255,
+      "grad_norm": 0.484375,
+      "learning_rate": 7.79355317675441e-05,
+      "loss": 0.3173,
+      "step": 370
+    },
+    {
+      "epoch": 0.3951527924130664,
+      "grad_norm": 0.47265625,
+      "learning_rate": 7.78740046849044e-05,
+      "loss": 0.3206,
+      "step": 375
+    },
+    {
+      "epoch": 0.40042149631190727,
+      "grad_norm": 0.455078125,
+      "learning_rate": 7.781160782638307e-05,
+      "loss": 0.3141,
+      "step": 380
+    },
+    {
+      "epoch": 0.40569020021074814,
+      "grad_norm": 0.490234375,
+      "learning_rate": 7.774834313900154e-05,
+      "loss": 0.3107,
+      "step": 385
+    },
+    {
+      "epoch": 0.410958904109589,
+      "grad_norm": 0.474609375,
+      "learning_rate": 7.76842125968607e-05,
+      "loss": 0.3315,
+      "step": 390
+    },
+    {
+      "epoch": 0.41622760800842995,
+      "grad_norm": 0.474609375,
+      "learning_rate": 7.761921820107951e-05,
+      "loss": 0.315,
+      "step": 395
+    },
+    {
+      "epoch": 0.4214963119072708,
+      "grad_norm": 0.484375,
+      "learning_rate": 7.755336197973248e-05,
+      "loss": 0.311,
+      "step": 400
+    },
+    {
+      "epoch": 0.4267650158061117,
+      "grad_norm": 0.490234375,
+      "learning_rate": 7.748664598778633e-05,
+      "loss": 0.3257,
+      "step": 405
+    },
+    {
+      "epoch": 0.4320337197049526,
+      "grad_norm": 0.474609375,
+      "learning_rate": 7.7419072307036e-05,
+      "loss": 0.3256,
+      "step": 410
+    },
+    {
+      "epoch": 0.43730242360379346,
+      "grad_norm": 0.484375,
+      "learning_rate": 7.735064304603954e-05,
+      "loss": 0.3226,
+      "step": 415
+    },
+    {
+      "epoch": 0.44257112750263433,
+      "grad_norm": 0.458984375,
+      "learning_rate": 7.728136034005241e-05,
+      "loss": 0.3197,
+      "step": 420
+    },
+    {
+      "epoch": 0.44783983140147526,
+      "grad_norm": 0.458984375,
+      "learning_rate": 7.721122635096086e-05,
+      "loss": 0.3207,
+      "step": 425
+    },
+    {
+      "epoch": 0.45310853530031614,
+      "grad_norm": 0.462890625,
+      "learning_rate": 7.714024326721441e-05,
+      "loss": 0.3092,
+      "step": 430
+    },
+    {
+      "epoch": 0.458377239199157,
+      "grad_norm": 0.44140625,
+      "learning_rate": 7.706841330375755e-05,
+      "loss": 0.3104,
+      "step": 435
+    },
+    {
+      "epoch": 0.4636459430979979,
+      "grad_norm": 0.470703125,
+      "learning_rate": 7.699573870196074e-05,
+      "loss": 0.3036,
+      "step": 440
+    },
+    {
+      "epoch": 0.46891464699683877,
+      "grad_norm": 0.453125,
+      "learning_rate": 7.692222172955035e-05,
+      "loss": 0.3061,
+      "step": 445
+    },
+    {
+      "epoch": 0.47418335089567965,
+      "grad_norm": 0.474609375,
+      "learning_rate": 7.684786468053799e-05,
+      "loss": 0.3148,
+      "step": 450
+    },
+    {
+      "epoch": 0.4794520547945205,
+      "grad_norm": 0.466796875,
+      "learning_rate": 7.677266987514882e-05,
+      "loss": 0.3157,
+      "step": 455
+    },
+    {
+      "epoch": 0.48472075869336145,
+      "grad_norm": 0.4453125,
+      "learning_rate": 7.669663965974923e-05,
+      "loss": 0.3147,
+      "step": 460
+    },
+    {
+      "epoch": 0.48998946259220233,
+      "grad_norm": 0.44140625,
+      "learning_rate": 7.661977640677366e-05,
+      "loss": 0.3053,
+      "step": 465
+    },
+    {
+      "epoch": 0.4952581664910432,
+      "grad_norm": 0.431640625,
+      "learning_rate": 7.654208251465047e-05,
+      "loss": 0.3111,
+      "step": 470
+    },
+    {
+      "epoch": 0.5005268703898841,
+      "grad_norm": 0.4609375,
+      "learning_rate": 7.646356040772716e-05,
+      "loss": 0.3195,
+      "step": 475
+    },
+    {
+      "epoch": 0.505795574288725,
+      "grad_norm": 0.455078125,
+      "learning_rate": 7.638421253619466e-05,
+      "loss": 0.3026,
+      "step": 480
+    },
+    {
+      "epoch": 0.5110642781875658,
+      "grad_norm": 0.443359375,
+      "learning_rate": 7.630404137601104e-05,
+      "loss": 0.3004,
+      "step": 485
+    },
+    {
+      "epoch": 0.5163329820864068,
+      "grad_norm": 0.48828125,
+      "learning_rate": 7.622304942882402e-05,
+      "loss": 0.3181,
+      "step": 490
+    },
+    {
+      "epoch": 0.5216016859852476,
+      "grad_norm": 0.474609375,
+      "learning_rate": 7.61412392218931e-05,
+      "loss": 0.3061,
+      "step": 495
+    },
+    {
+      "epoch": 0.5268703898840885,
+      "grad_norm": 0.455078125,
+      "learning_rate": 7.605861330801056e-05,
+      "loss": 0.3244,
+      "step": 500
+    },
+    {
+      "epoch": 0.5268703898840885,
+      "eval_loss": 0.3330249488353729,
+      "eval_runtime": 2.6362,
+      "eval_samples_per_second": 17.829,
+      "eval_steps_per_second": 17.829,
+      "step": 500
+    },
+    {
+      "epoch": 0.5321390937829295,
+      "grad_norm": 0.466796875,
+      "learning_rate": 7.597517426542193e-05,
+      "loss": 0.3059,
+      "step": 505
+    },
+    {
+      "epoch": 0.5374077976817703,
+      "grad_norm": 0.458984375,
+      "learning_rate": 7.589092469774541e-05,
+      "loss": 0.307,
+      "step": 510
+    },
+    {
+      "epoch": 0.5426765015806112,
+      "grad_norm": 0.435546875,
+      "learning_rate": 7.580586723389075e-05,
+      "loss": 0.3093,
+      "step": 515
+    },
+    {
+      "epoch": 0.547945205479452,
+      "grad_norm": 0.447265625,
+      "learning_rate": 7.572000452797713e-05,
+      "loss": 0.3079,
+      "step": 520
+    },
+    {
+      "epoch": 0.553213909378293,
+      "grad_norm": 0.453125,
+      "learning_rate": 7.563333925925036e-05,
+      "loss": 0.3077,
+      "step": 525
+    },
+    {
+      "epoch": 0.5584826132771338,
+      "grad_norm": 0.484375,
+      "learning_rate": 7.554587413199932e-05,
+      "loss": 0.3033,
+      "step": 530
+    },
+    {
+      "epoch": 0.5637513171759747,
+      "grad_norm": 0.423828125,
+      "learning_rate": 7.545761187547155e-05,
+      "loss": 0.3131,
+      "step": 535
+    },
+    {
+      "epoch": 0.5690200210748156,
+      "grad_norm": 0.421875,
+      "learning_rate": 7.536855524378804e-05,
+      "loss": 0.3053,
+      "step": 540
+    },
+    {
+      "epoch": 0.5742887249736565,
+      "grad_norm": 0.44921875,
+      "learning_rate": 7.527870701585735e-05,
+      "loss": 0.3038,
+      "step": 545
+    },
+    {
+      "epoch": 0.5795574288724974,
+      "grad_norm": 0.455078125,
+      "learning_rate": 7.518806999528887e-05,
+      "loss": 0.3086,
+      "step": 550
+    },
+    {
+      "epoch": 0.5848261327713382,
+      "grad_norm": 0.455078125,
+      "learning_rate": 7.50966470103054e-05,
+      "loss": 0.3003,
+      "step": 555
+    },
+    {
+      "epoch": 0.5900948366701791,
+      "grad_norm": 0.466796875,
+      "learning_rate": 7.500444091365479e-05,
+      "loss": 0.3078,
+      "step": 560
+    },
+    {
+      "epoch": 0.59536354056902,
+      "grad_norm": 0.44140625,
+      "learning_rate": 7.491145458252099e-05,
+      "loss": 0.3034,
+      "step": 565
+    },
+    {
+      "epoch": 0.6006322444678609,
+      "grad_norm": 0.47265625,
+      "learning_rate": 7.481769091843424e-05,
+      "loss": 0.3054,
+      "step": 570
+    },
+    {
+      "epoch": 0.6059009483667018,
+      "grad_norm": 0.44140625,
+      "learning_rate": 7.472315284718061e-05,
+      "loss": 0.2977,
+      "step": 575
+    },
+    {
+      "epoch": 0.6111696522655427,
+      "grad_norm": 0.427734375,
+      "learning_rate": 7.46278433187106e-05,
+      "loss": 0.2976,
+      "step": 580
+    },
+    {
+      "epoch": 0.6164383561643836,
+      "grad_norm": 0.4375,
+      "learning_rate": 7.453176530704713e-05,
+      "loss": 0.3064,
+      "step": 585
+    },
+    {
+      "epoch": 0.6217070600632244,
+      "grad_norm": 0.484375,
+      "learning_rate": 7.443492181019277e-05,
+      "loss": 0.2976,
+      "step": 590
+    },
+    {
+      "epoch": 0.6269757639620653,
+      "grad_norm": 0.455078125,
+      "learning_rate": 7.433731585003612e-05,
+      "loss": 0.2993,
+      "step": 595
+    },
+    {
+      "epoch": 0.6322444678609063,
+      "grad_norm": 0.4375,
+      "learning_rate": 7.423895047225762e-05,
+      "loss": 0.2986,
+      "step": 600
+    },
+    {
+      "epoch": 0.6375131717597471,
+      "grad_norm": 0.44140625,
+      "learning_rate": 7.413982874623443e-05,
+      "loss": 0.2996,
+      "step": 605
+    },
+    {
+      "epoch": 0.642781875658588,
+      "grad_norm": 0.462890625,
+      "learning_rate": 7.403995376494465e-05,
+      "loss": 0.3115,
+      "step": 610
+    },
+    {
+      "epoch": 0.6480505795574288,
+      "grad_norm": 0.455078125,
+      "learning_rate": 7.393932864487086e-05,
+      "loss": 0.3031,
+      "step": 615
+    },
+    {
+      "epoch": 0.6533192834562698,
+      "grad_norm": 0.462890625,
+      "learning_rate": 7.383795652590285e-05,
+      "loss": 0.3078,
+      "step": 620
+    },
+    {
+      "epoch": 0.6585879873551106,
+      "grad_norm": 0.455078125,
+      "learning_rate": 7.373584057123965e-05,
+      "loss": 0.2977,
+      "step": 625
+    },
+    {
+      "epoch": 0.6638566912539515,
+      "grad_norm": 0.41796875,
+      "learning_rate": 7.363298396729077e-05,
+      "loss": 0.2922,
+      "step": 630
+    },
+    {
+      "epoch": 0.6691253951527925,
+      "grad_norm": 0.4296875,
+      "learning_rate": 7.352938992357685e-05,
+      "loss": 0.2964,
+      "step": 635
+    },
+    {
+      "epoch": 0.6743940990516333,
+      "grad_norm": 0.44140625,
+      "learning_rate": 7.342506167262951e-05,
+      "loss": 0.3015,
+      "step": 640
+    },
+    {
+      "epoch": 0.6796628029504742,
+      "grad_norm": 0.443359375,
+      "learning_rate": 7.33200024698904e-05,
+      "loss": 0.3019,
+      "step": 645
+    },
+    {
+      "epoch": 0.684931506849315,
+      "grad_norm": 0.478515625,
+      "learning_rate": 7.32142155936097e-05,
+      "loss": 0.3052,
+      "step": 650
+    },
+    {
+      "epoch": 0.690200210748156,
+      "grad_norm": 0.439453125,
+      "learning_rate": 7.310770434474381e-05,
+      "loss": 0.2968,
+      "step": 655
+    },
+    {
+      "epoch": 0.6954689146469969,
+      "grad_norm": 0.439453125,
+      "learning_rate": 7.300047204685228e-05,
+      "loss": 0.3029,
+      "step": 660
+    },
+    {
+      "epoch": 0.7007376185458377,
+      "grad_norm": 0.421875,
+      "learning_rate": 7.28925220459942e-05,
+      "loss": 0.3056,
+      "step": 665
+    },
+    {
+      "epoch": 0.7060063224446786,
+      "grad_norm": 0.451171875,
+      "learning_rate": 7.278385771062373e-05,
+      "loss": 0.3053,
+      "step": 670
+    },
+    {
+      "epoch": 0.7112750263435195,
+      "grad_norm": 0.44921875,
+      "learning_rate": 7.267448243148501e-05,
+      "loss": 0.2986,
+      "step": 675
+    },
+    {
+      "epoch": 0.7165437302423604,
+      "grad_norm": 0.462890625,
+      "learning_rate": 7.256439962150638e-05,
+      "loss": 0.2891,
+      "step": 680
+    },
+    {
+      "epoch": 0.7218124341412012,
+      "grad_norm": 0.416015625,
+      "learning_rate": 7.245361271569382e-05,
+      "loss": 0.2968,
+      "step": 685
+    },
+    {
+      "epoch": 0.7270811380400422,
+      "grad_norm": 0.455078125,
+      "learning_rate": 7.234212517102378e-05,
+      "loss": 0.3014,
+      "step": 690
+    },
+    {
+      "epoch": 0.7323498419388831,
+      "grad_norm": 0.48828125,
+      "learning_rate": 7.222994046633541e-05,
+      "loss": 0.3031,
+      "step": 695
+    },
+    {
+      "epoch": 0.7376185458377239,
+      "grad_norm": 0.451171875,
+      "learning_rate": 7.211706210222186e-05,
+      "loss": 0.3031,
+      "step": 700
+    },
+    {
+      "epoch": 0.7428872497365648,
+      "grad_norm": 0.421875,
+      "learning_rate": 7.200349360092113e-05,
+      "loss": 0.2889,
+      "step": 705
+    },
+    {
+      "epoch": 0.7481559536354057,
+      "grad_norm": 0.427734375,
+      "learning_rate": 7.188923850620616e-05,
+      "loss": 0.2839,
+      "step": 710
+    },
+    {
+      "epoch": 0.7534246575342466,
+      "grad_norm": 0.435546875,
+      "learning_rate": 7.177430038327424e-05,
+      "loss": 0.3007,
+      "step": 715
+    },
+    {
+      "epoch": 0.7586933614330874,
+      "grad_norm": 0.431640625,
+      "learning_rate": 7.165868281863572e-05,
+      "loss": 0.2841,
+      "step": 720
+    },
+    {
+      "epoch": 0.7639620653319283,
+      "grad_norm": 0.427734375,
+      "learning_rate": 7.15423894200022e-05,
+      "loss": 0.2979,
+      "step": 725
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 0.435546875,
+      "learning_rate": 7.142542381617388e-05,
+      "loss": 0.2918,
+      "step": 730
+    },
+    {
+      "epoch": 0.7744994731296101,
+      "grad_norm": 0.44140625,
+      "learning_rate": 7.130778965692629e-05,
+      "loss": 0.3029,
+      "step": 735
+    },
+    {
+      "epoch": 0.779768177028451,
+      "grad_norm": 0.453125,
+      "learning_rate": 7.118949061289649e-05,
+      "loss": 0.3013,
+      "step": 740
+    },
+    {
+      "epoch": 0.7850368809272918,
+      "grad_norm": 0.43359375,
+      "learning_rate": 7.107053037546851e-05,
+      "loss": 0.3047,
+      "step": 745
+    },
+    {
+      "epoch": 0.7903055848261328,
+      "grad_norm": 0.439453125,
+      "learning_rate": 7.095091265665814e-05,
+      "loss": 0.2979,
+      "step": 750
+    },
+    {
+      "epoch": 0.7955742887249737,
+      "grad_norm": 0.466796875,
+      "learning_rate": 7.083064118899708e-05,
+      "loss": 0.293,
+      "step": 755
+    },
+    {
+      "epoch": 0.8008429926238145,
+      "grad_norm": 0.431640625,
+      "learning_rate": 7.070971972541654e-05,
+      "loss": 0.2998,
+      "step": 760
+    },
+    {
+      "epoch": 0.8061116965226555,
+      "grad_norm": 0.427734375,
+      "learning_rate": 7.05881520391301e-05,
+      "loss": 0.2864,
+      "step": 765
+    },
+    {
+      "epoch": 0.8113804004214963,
+      "grad_norm": 0.42578125,
+      "learning_rate": 7.046594192351595e-05,
+      "loss": 0.2927,
+      "step": 770
+    },
+    {
+      "epoch": 0.8166491043203372,
+      "grad_norm": 0.4453125,
+      "learning_rate": 7.034309319199853e-05,
+      "loss": 0.294,
+      "step": 775
+    },
+    {
+      "epoch": 0.821917808219178,
+      "grad_norm": 0.4375,
+      "learning_rate": 7.021960967792956e-05,
+      "loss": 0.2912,
+      "step": 780
+    },
+    {
+      "epoch": 0.827186512118019,
+      "grad_norm": 0.435546875,
+      "learning_rate": 7.009549523446842e-05,
+      "loss": 0.292,
+      "step": 785
+    },
+    {
+      "epoch": 0.8324552160168599,
+      "grad_norm": 0.439453125,
+      "learning_rate": 6.997075373446187e-05,
+      "loss": 0.3041,
+      "step": 790
+    },
+    {
+      "epoch": 0.8377239199157007,
+      "grad_norm": 0.435546875,
+      "learning_rate": 6.98453890703233e-05,
+      "loss": 0.2996,
+      "step": 795
+    },
+    {
+      "epoch": 0.8429926238145417,
+      "grad_norm": 0.443359375,
+      "learning_rate": 6.971940515391118e-05,
+      "loss": 0.2953,
+      "step": 800
+    },
+    {
+      "epoch": 0.8482613277133825,
+      "grad_norm": 0.43359375,
+      "learning_rate": 6.9592805916407e-05,
+      "loss": 0.2954,
+      "step": 805
+    },
+    {
+      "epoch": 0.8535300316122234,
+      "grad_norm": 0.447265625,
+      "learning_rate": 6.946559530819265e-05,
+      "loss": 0.2937,
+      "step": 810
+    },
+    {
+      "epoch": 0.8587987355110642,
+      "grad_norm": 0.44921875,
+      "learning_rate": 6.933777729872716e-05,
+      "loss": 0.3035,
+      "step": 815
+    },
+    {
+      "epoch": 0.8640674394099052,
+      "grad_norm": 0.435546875,
+      "learning_rate": 6.920935587642278e-05,
+      "loss": 0.2913,
+      "step": 820
+    },
+    {
+      "epoch": 0.8693361433087461,
+      "grad_norm": 0.435546875,
+      "learning_rate": 6.908033504852054e-05,
+      "loss": 0.2961,
+      "step": 825
+    },
+    {
+      "epoch": 0.8746048472075869,
+      "grad_norm": 0.43359375,
+      "learning_rate": 6.895071884096526e-05,
+      "loss": 0.2846,
+      "step": 830
+    },
+    {
+      "epoch": 0.8798735511064278,
+      "grad_norm": 0.423828125,
+      "learning_rate": 6.882051129827989e-05,
+      "loss": 0.288,
+      "step": 835
+    },
+    {
+      "epoch": 0.8851422550052687,
+      "grad_norm": 0.4453125,
+      "learning_rate": 6.868971648343925e-05,
+      "loss": 0.2927,
+      "step": 840
+    },
+    {
+      "epoch": 0.8904109589041096,
+      "grad_norm": 0.423828125,
+      "learning_rate": 6.855833847774337e-05,
+      "loss": 0.286,
+      "step": 845
+    },
+    {
+      "epoch": 0.8956796628029505,
+      "grad_norm": 0.447265625,
+      "learning_rate": 6.842638138069003e-05,
+      "loss": 0.2863,
+      "step": 850
+    },
+    {
+      "epoch": 0.9009483667017913,
+      "grad_norm": 0.42578125,
+      "learning_rate": 6.82938493098469e-05,
+      "loss": 0.2901,
+      "step": 855
+    },
+    {
+      "epoch": 0.9062170706006323,
+      "grad_norm": 0.439453125,
+      "learning_rate": 6.816074640072305e-05,
+      "loss": 0.2884,
+      "step": 860
+    },
+    {
+      "epoch": 0.9114857744994731,
+      "grad_norm": 0.4375,
+      "learning_rate": 6.802707680663987e-05,
+      "loss": 0.2903,
+      "step": 865
+    },
+    {
+      "epoch": 0.916754478398314,
+      "grad_norm": 0.43359375,
+      "learning_rate": 6.789284469860146e-05,
+      "loss": 0.2945,
+      "step": 870
+    },
+    {
+      "epoch": 0.9220231822971549,
+      "grad_norm": 0.44140625,
+      "learning_rate": 6.775805426516464e-05,
+      "loss": 0.2864,
+      "step": 875
+    },
+    {
+      "epoch": 0.9272918861959958,
+      "grad_norm": 0.439453125,
+      "learning_rate": 6.762270971230799e-05,
+      "loss": 0.287,
+      "step": 880
+    },
+    {
+      "epoch": 0.9325605900948367,
+      "grad_norm": 0.44140625,
+      "learning_rate": 6.748681526330078e-05,
+      "loss": 0.2936,
+      "step": 885
+    },
+    {
+      "epoch": 0.9378292939936775,
+      "grad_norm": 0.4375,
+      "learning_rate": 6.73503751585712e-05,
+      "loss": 0.2907,
+      "step": 890
+    },
+    {
+      "epoch": 0.9430979978925185,
+      "grad_norm": 0.43359375,
+      "learning_rate": 6.721339365557394e-05,
+      "loss": 0.2864,
+      "step": 895
+    },
+    {
+      "epoch": 0.9483667017913593,
+      "grad_norm": 0.419921875,
+      "learning_rate": 6.707587502865739e-05,
+      "loss": 0.2948,
+      "step": 900
+    },
+    {
+      "epoch": 0.9536354056902002,
+      "grad_norm": 0.4375,
+      "learning_rate": 6.693782356893032e-05,
+      "loss": 0.2962,
+      "step": 905
+    },
+    {
+      "epoch": 0.958904109589041,
+      "grad_norm": 0.439453125,
+      "learning_rate": 6.679924358412785e-05,
+      "loss": 0.2898,
+      "step": 910
+    },
+    {
+      "epoch": 0.964172813487882,
+      "grad_norm": 0.451171875,
+      "learning_rate": 6.666013939847719e-05,
+      "loss": 0.2833,
+      "step": 915
+    },
+    {
+      "epoch": 0.9694415173867229,
+      "grad_norm": 0.44140625,
+      "learning_rate": 6.652051535256257e-05,
+      "loss": 0.2995,
+      "step": 920
+    },
+    {
+      "epoch": 0.9747102212855637,
+      "grad_norm": 0.44140625,
+      "learning_rate": 6.638037580318988e-05,
+      "loss": 0.2863,
+      "step": 925
+    },
+    {
+      "epoch": 0.9799789251844047,
+      "grad_norm": 0.478515625,
+      "learning_rate": 6.623972512325068e-05,
+      "loss": 0.294,
+      "step": 930
+    },
+    {
+      "epoch": 0.9852476290832455,
+      "grad_norm": 0.41015625,
+      "learning_rate": 6.609856770158579e-05,
+      "loss": 0.2806,
+      "step": 935
+    },
+    {
+      "epoch": 0.9905163329820864,
+      "grad_norm": 0.421875,
+      "learning_rate": 6.595690794284828e-05,
+      "loss": 0.2901,
+      "step": 940
+    },
+    {
+      "epoch": 0.9957850368809273,
+      "grad_norm": 0.400390625,
+      "learning_rate": 6.581475026736611e-05,
+      "loss": 0.2799,
+      "step": 945
+    },
+    {
+      "epoch": 0.9989462592202318,
+      "eval_loss": 0.3042255640029907,
+      "eval_runtime": 2.5158,
+      "eval_samples_per_second": 18.682,
+      "eval_steps_per_second": 18.682,
+      "step": 948
+    },
+    {
+      "epoch": 1.0010537407797682,
+      "grad_norm": 0.59765625,
+      "learning_rate": 6.56720991110041e-05,
+      "loss": 0.2765,
+      "step": 950
+    },
+    {
+      "epoch": 1.006322444678609,
+      "grad_norm": 0.51171875,
+      "learning_rate": 6.552895892502563e-05,
+      "loss": 0.2285,
+      "step": 955
+    },
+    {
+      "epoch": 1.01159114857745,
+      "grad_norm": 0.416015625,
+      "learning_rate": 6.538533417595359e-05,
+      "loss": 0.2271,
+      "step": 960
+    },
+    {
+      "epoch": 1.0168598524762908,
+      "grad_norm": 0.451171875,
+      "learning_rate": 6.52412293454312e-05,
+      "loss": 0.2292,
+      "step": 965
+    },
+    {
+      "epoch": 1.0221285563751317,
+      "grad_norm": 0.44140625,
+      "learning_rate": 6.5096648930082e-05,
+      "loss": 0.2196,
+      "step": 970
+    },
+    {
+      "epoch": 1.0273972602739727,
+      "grad_norm": 0.439453125,
+      "learning_rate": 6.495159744136959e-05,
+      "loss": 0.2249,
+      "step": 975
+    },
+    {
+      "epoch": 1.0326659641728135,
+      "grad_norm": 0.431640625,
+      "learning_rate": 6.480607940545692e-05,
+      "loss": 0.2212,
+      "step": 980
+    },
+    {
+      "epoch": 1.0379346680716544,
+      "grad_norm": 0.44921875,
+      "learning_rate": 6.466009936306498e-05,
+      "loss": 0.2272,
+      "step": 985
+    },
+    {
+      "epoch": 1.0432033719704952,
+      "grad_norm": 0.408203125,
+      "learning_rate": 6.45136618693311e-05,
+      "loss": 0.2253,
+      "step": 990
+    },
+    {
+      "epoch": 1.0484720758693362,
+      "grad_norm": 0.447265625,
+      "learning_rate": 6.436677149366688e-05,
+      "loss": 0.229,
+      "step": 995
+    },
+    {
+      "epoch": 1.053740779768177,
+      "grad_norm": 0.427734375,
+      "learning_rate": 6.42194328196156e-05,
+      "loss": 0.2252,
+      "step": 1000
+    },
+    {
+      "epoch": 1.053740779768177,
+      "eval_loss": 0.30919456481933594,
+      "eval_runtime": 2.4891,
+      "eval_samples_per_second": 18.882,
+      "eval_steps_per_second": 18.882,
+      "step": 1000
+    },
+    {
+      "epoch": 1.0590094836670179,
+      "grad_norm": 0.494140625,
+      "learning_rate": 6.407165044470911e-05,
+      "loss": 0.2241,
+      "step": 1005
+    },
+    {
+      "epoch": 1.064278187565859,
+      "grad_norm": 0.4375,
+      "learning_rate": 6.392342898032445e-05,
+      "loss": 0.2296,
+      "step": 1010
+    },
+    {
+      "epoch": 1.0695468914646997,
+      "grad_norm": 0.478515625,
+      "learning_rate": 6.377477305153997e-05,
+      "loss": 0.2257,
+      "step": 1015
+    },
+    {
+      "epoch": 1.0748155953635405,
+      "grad_norm": 0.478515625,
+      "learning_rate": 6.362568729699093e-05,
+      "loss": 0.2269,
+      "step": 1020
+    },
+    {
+      "epoch": 1.0800842992623814,
+      "grad_norm": 0.48828125,
+      "learning_rate": 6.347617636872484e-05,
+      "loss": 0.2314,
+      "step": 1025
+    },
+    {
+      "epoch": 1.0853530031612224,
+      "grad_norm": 0.439453125,
+      "learning_rate": 6.332624493205623e-05,
+      "loss": 0.2261,
+      "step": 1030
+    },
+    {
+      "epoch": 1.0906217070600632,
+      "grad_norm": 0.45703125,
+      "learning_rate": 6.317589766542111e-05,
+      "loss": 0.2239,
+      "step": 1035
+    },
+    {
+      "epoch": 1.095890410958904,
+      "grad_norm": 0.431640625,
+      "learning_rate": 6.3025139260231e-05,
+      "loss": 0.2234,
+      "step": 1040
+    },
+    {
+      "epoch": 1.101159114857745,
+      "grad_norm": 0.4453125,
+      "learning_rate": 6.287397442072653e-05,
+      "loss": 0.2342,
+      "step": 1045
+    },
+    {
+      "epoch": 1.106427818756586,
+      "grad_norm": 0.443359375,
+      "learning_rate": 6.272240786383057e-05,
+      "loss": 0.2307,
+      "step": 1050
+    },
+    {
+      "epoch": 1.1116965226554267,
+      "grad_norm": 0.443359375,
+      "learning_rate": 6.257044431900121e-05,
+      "loss": 0.2376,
+      "step": 1055
+    },
+    {
+      "epoch": 1.1169652265542676,
+      "grad_norm": 0.4375,
+      "learning_rate": 6.241808852808403e-05,
+      "loss": 0.2319,
+      "step": 1060
+    },
+    {
+      "epoch": 1.1222339304531086,
+      "grad_norm": 0.439453125,
+      "learning_rate": 6.226534524516418e-05,
+      "loss": 0.2285,
+      "step": 1065
+    },
+    {
+      "epoch": 1.1275026343519494,
+      "grad_norm": 0.427734375,
+      "learning_rate": 6.21122192364181e-05,
+      "loss": 0.2254,
+      "step": 1070
+    },
+    {
+      "epoch": 1.1327713382507902,
+      "grad_norm": 0.4453125,
+      "learning_rate": 6.195871527996467e-05,
+      "loss": 0.2325,
+      "step": 1075
+    },
+    {
+      "epoch": 1.1380400421496313,
+      "grad_norm": 0.421875,
+      "learning_rate": 6.180483816571628e-05,
+      "loss": 0.2333,
+      "step": 1080
+    },
+    {
+      "epoch": 1.143308746048472,
+      "grad_norm": 0.41796875,
+      "learning_rate": 6.165059269522921e-05,
+      "loss": 0.2162,
+      "step": 1085
+    },
+    {
+      "epoch": 1.148577449947313,
+      "grad_norm": 0.44140625,
+      "learning_rate": 6.149598368155386e-05,
+      "loss": 0.2269,
+      "step": 1090
+    },
+    {
+      "epoch": 1.1538461538461537,
+      "grad_norm": 0.419921875,
+      "learning_rate": 6.13410159490846e-05,
+      "loss": 0.2304,
+      "step": 1095
+    },
+    {
+      "epoch": 1.1591148577449948,
+      "grad_norm": 0.48046875,
+      "learning_rate": 6.118569433340927e-05,
+      "loss": 0.2246,
+      "step": 1100
+    },
+    {
+      "epoch": 1.1643835616438356,
+      "grad_norm": 0.427734375,
+      "learning_rate": 6.103002368115805e-05,
+      "loss": 0.219,
+      "step": 1105
+    },
+    {
+      "epoch": 1.1696522655426764,
+      "grad_norm": 0.43359375,
+      "learning_rate": 6.0874008849852566e-05,
+      "loss": 0.2299,
+      "step": 1110
+    },
+    {
+      "epoch": 1.1749209694415175,
+      "grad_norm": 0.4296875,
+      "learning_rate": 6.071765470775406e-05,
+      "loss": 0.2222,
+      "step": 1115
+    },
+    {
+      "epoch": 1.1801896733403583,
+      "grad_norm": 0.423828125,
+      "learning_rate": 6.056096613371163e-05,
+      "loss": 0.224,
+      "step": 1120
+    },
+    {
+      "epoch": 1.1854583772391991,
+      "grad_norm": 0.458984375,
+      "learning_rate": 6.040394801700989e-05,
+      "loss": 0.2249,
+      "step": 1125
+    },
+    {
+      "epoch": 1.1907270811380402,
+      "grad_norm": 0.4375,
+      "learning_rate": 6.024660525721645e-05,
+      "loss": 0.2188,
+      "step": 1130
+    },
+    {
+      "epoch": 1.195995785036881,
+      "grad_norm": 0.423828125,
+      "learning_rate": 6.008894276402905e-05,
+      "loss": 0.2212,
+      "step": 1135
+    },
+    {
+      "epoch": 1.2012644889357218,
+      "grad_norm": 0.4375,
+      "learning_rate": 5.993096545712233e-05,
+      "loss": 0.2234,
+      "step": 1140
+    },
+    {
+      "epoch": 1.2065331928345626,
+      "grad_norm": 0.42578125,
+      "learning_rate": 5.977267826599435e-05,
+      "loss": 0.2186,
+      "step": 1145
+    },
+    {
+      "epoch": 1.2118018967334037,
+      "grad_norm": 0.4453125,
+      "learning_rate": 5.9614086129812696e-05,
+      "loss": 0.2269,
+      "step": 1150
+    },
+    {
+      "epoch": 1.2170706006322445,
+      "grad_norm": 0.4609375,
+      "learning_rate": 5.945519399726045e-05,
+      "loss": 0.2257,
+      "step": 1155
+    },
+    {
+      "epoch": 1.2223393045310853,
+      "grad_norm": 0.443359375,
+      "learning_rate": 5.929600682638171e-05,
+      "loss": 0.2274,
+      "step": 1160
+    },
+    {
+      "epoch": 1.2276080084299261,
+      "grad_norm": 0.4296875,
+      "learning_rate": 5.913652958442693e-05,
+      "loss": 0.2169,
+      "step": 1165
+    },
+    {
+      "epoch": 1.2328767123287672,
+      "grad_norm": 0.44921875,
+      "learning_rate": 5.8976767247697856e-05,
+      "loss": 0.2267,
+      "step": 1170
+    },
+    {
+      "epoch": 1.238145416227608,
+      "grad_norm": 0.4296875,
+      "learning_rate": 5.88167248013923e-05,
+      "loss": 0.2205,
+      "step": 1175
+    },
+    {
+      "epoch": 1.2434141201264488,
+      "grad_norm": 0.453125,
+      "learning_rate": 5.865640723944859e-05,
+      "loss": 0.2256,
+      "step": 1180
+    },
+    {
+      "epoch": 1.2486828240252899,
+      "grad_norm": 0.44140625,
+      "learning_rate": 5.849581956438969e-05,
+      "loss": 0.2294,
+      "step": 1185
+    },
+    {
+      "epoch": 1.2539515279241307,
+      "grad_norm": 0.470703125,
+      "learning_rate": 5.8334966787167135e-05,
+      "loss": 0.2247,
+      "step": 1190
+    },
+    {
+      "epoch": 1.2592202318229715,
+      "grad_norm": 0.419921875,
+      "learning_rate": 5.8173853927004676e-05,
+      "loss": 0.2187,
+      "step": 1195
+    },
+    {
+      "epoch": 1.2644889357218125,
+      "grad_norm": 0.4453125,
+      "learning_rate": 5.801248601124164e-05,
+      "loss": 0.2276,
+      "step": 1200
+    },
+    {
+      "epoch": 1.2697576396206534,
+      "grad_norm": 0.42578125,
+      "learning_rate": 5.7850868075176056e-05,
+      "loss": 0.235,
+      "step": 1205
+    },
+    {
+      "epoch": 1.2750263435194942,
+      "grad_norm": 0.416015625,
+      "learning_rate": 5.7689005161907586e-05,
+      "loss": 0.2252,
+      "step": 1210
+    },
+    {
+      "epoch": 1.2802950474183352,
+      "grad_norm": 0.4296875,
+      "learning_rate": 5.752690232218005e-05,
+      "loss": 0.2244,
+      "step": 1215
+    },
+    {
+      "epoch": 1.285563751317176,
+      "grad_norm": 0.44921875,
+      "learning_rate": 5.7364564614223974e-05,
+      "loss": 0.2224,
+      "step": 1220
+    },
+    {
+      "epoch": 1.2908324552160169,
+      "grad_norm": 0.44140625,
+      "learning_rate": 5.720199710359862e-05,
+      "loss": 0.2329,
+      "step": 1225
+    },
+    {
+      "epoch": 1.2961011591148577,
+      "grad_norm": 0.419921875,
+      "learning_rate": 5.703920486303399e-05,
+      "loss": 0.2193,
+      "step": 1230
+    },
+    {
+      "epoch": 1.3013698630136985,
+      "grad_norm": 0.44140625,
+      "learning_rate": 5.6876192972272516e-05,
+      "loss": 0.2306,
+      "step": 1235
+    },
+    {
+      "epoch": 1.3066385669125395,
+      "grad_norm": 0.443359375,
+      "learning_rate": 5.6712966517910595e-05,
+      "loss": 0.2199,
+      "step": 1240
+    },
+    {
+      "epoch": 1.3119072708113804,
+      "grad_norm": 0.42578125,
+      "learning_rate": 5.654953059323978e-05,
+      "loss": 0.2217,
+      "step": 1245
+    },
+    {
+      "epoch": 1.3171759747102212,
+      "grad_norm": 0.42578125,
+      "learning_rate": 5.638589029808793e-05,
+      "loss": 0.2288,
+      "step": 1250
+    },
+    {
+      "epoch": 1.3224446786090622,
+      "grad_norm": 0.45703125,
+      "learning_rate": 5.6222050738660043e-05,
+      "loss": 0.2265,
+      "step": 1255
+    },
+    {
+      "epoch": 1.327713382507903,
+      "grad_norm": 0.431640625,
+      "learning_rate": 5.605801702737892e-05,
+      "loss": 0.2341,
+      "step": 1260
+    },
+    {
+      "epoch": 1.3329820864067439,
+      "grad_norm": 0.453125,
+      "learning_rate": 5.5893794282725646e-05,
+      "loss": 0.23,
+      "step": 1265
+    },
+    {
+      "epoch": 1.338250790305585,
+      "grad_norm": 0.4453125,
+      "learning_rate": 5.5729387629079884e-05,
+      "loss": 0.2156,
+      "step": 1270
+    },
+    {
+      "epoch": 1.3435194942044257,
+      "grad_norm": 0.4609375,
+      "learning_rate": 5.556480219655995e-05,
+      "loss": 0.2279,
+      "step": 1275
+    },
+    {
+      "epoch": 1.3487881981032666,
+      "grad_norm": 0.4296875,
+      "learning_rate": 5.540004312086276e-05,
+      "loss": 0.2238,
+      "step": 1280
+    },
+    {
+      "epoch": 1.3540569020021076,
+      "grad_norm": 0.474609375,
+      "learning_rate": 5.523511554310354e-05,
+      "loss": 0.234,
+      "step": 1285
+    },
+    {
+      "epoch": 1.3593256059009484,
+      "grad_norm": 0.43359375,
+      "learning_rate": 5.507002460965547e-05,
+      "loss": 0.2243,
+      "step": 1290
+    },
+    {
+      "epoch": 1.3645943097997892,
+      "grad_norm": 0.43359375,
+      "learning_rate": 5.490477547198899e-05,
+      "loss": 0.2217,
+      "step": 1295
+    },
+    {
+      "epoch": 1.36986301369863,
+      "grad_norm": 0.455078125,
+      "learning_rate": 5.47393732865112e-05,
+      "loss": 0.2297,
+      "step": 1300
+    },
+    {
+      "epoch": 1.375131717597471,
+      "grad_norm": 0.421875,
+      "learning_rate": 5.457382321440477e-05,
+      "loss": 0.2282,
+      "step": 1305
+    },
+    {
+      "epoch": 1.380400421496312,
+      "grad_norm": 0.44921875,
+      "learning_rate": 5.4408130421467115e-05,
+      "loss": 0.2275,
+      "step": 1310
+    },
+    {
+      "epoch": 1.3856691253951527,
+      "grad_norm": 0.43359375,
+      "learning_rate": 5.424230007794903e-05,
+      "loss": 0.2246,
+      "step": 1315
+    },
+    {
+      "epoch": 1.3909378292939936,
+      "grad_norm": 0.486328125,
+      "learning_rate": 5.40763373583934e-05,
+      "loss": 0.2187,
+      "step": 1320
+    },
+    {
+      "epoch": 1.3962065331928346,
+      "grad_norm": 0.45703125,
+      "learning_rate": 5.391024744147379e-05,
+      "loss": 0.2226,
+      "step": 1325
+    },
+    {
+      "epoch": 1.4014752370916754,
+      "grad_norm": 0.455078125,
+      "learning_rate": 5.374403550983279e-05,
+      "loss": 0.2213,
+      "step": 1330
+    },
+    {
+      "epoch": 1.4067439409905163,
+      "grad_norm": 0.455078125,
+      "learning_rate": 5.357770674992032e-05,
+      "loss": 0.226,
+      "step": 1335
+    },
+    {
+      "epoch": 1.4120126448893573,
+      "grad_norm": 0.42578125,
+      "learning_rate": 5.341126635183178e-05,
+      "loss": 0.2262,
+      "step": 1340
+    },
+    {
+      "epoch": 1.4172813487881981,
+      "grad_norm": 0.44921875,
+      "learning_rate": 5.324471950914613e-05,
+      "loss": 0.2298,
+      "step": 1345
+    },
+    {
+      "epoch": 1.422550052687039,
+      "grad_norm": 0.4453125,
+      "learning_rate": 5.30780714187638e-05,
+      "loss": 0.2197,
+      "step": 1350
+    },
+    {
+      "epoch": 1.42781875658588,
+      "grad_norm": 0.421875,
+      "learning_rate": 5.291132728074453e-05,
+      "loss": 0.2231,
+      "step": 1355
+    },
+    {
+      "epoch": 1.4330874604847208,
+      "grad_norm": 0.44140625,
+      "learning_rate": 5.2744492298145136e-05,
+      "loss": 0.2251,
+      "step": 1360
+    },
+    {
+      "epoch": 1.4383561643835616,
+      "grad_norm": 0.453125,
+      "learning_rate": 5.25775716768571e-05,
+      "loss": 0.2252,
+      "step": 1365
+    },
+    {
+      "epoch": 1.4436248682824027,
+      "grad_norm": 0.42578125,
+      "learning_rate": 5.24105706254442e-05,
+      "loss": 0.2238,
+      "step": 1370
+    },
+    {
+      "epoch": 1.4488935721812435,
+      "grad_norm": 0.4140625,
+      "learning_rate": 5.224349435497989e-05,
+      "loss": 0.2259,
+      "step": 1375
+    },
+    {
+      "epoch": 1.4541622760800843,
+      "grad_norm": 0.44140625,
+      "learning_rate": 5.207634807888481e-05,
+      "loss": 0.2172,
+      "step": 1380
+    },
+    {
+      "epoch": 1.4594309799789251,
+      "grad_norm": 0.43359375,
+      "learning_rate": 5.190913701276398e-05,
+      "loss": 0.2133,
+      "step": 1385
+    },
+    {
+      "epoch": 1.464699683877766,
+      "grad_norm": 0.43359375,
+      "learning_rate": 5.1741866374244174e-05,
+      "loss": 0.2195,
+      "step": 1390
+    },
+    {
+      "epoch": 1.469968387776607,
+      "grad_norm": 0.4296875,
+      "learning_rate": 5.157454138281102e-05,
+      "loss": 0.2223,
+      "step": 1395
+    },
+    {
+      "epoch": 1.4752370916754478,
+      "grad_norm": 0.431640625,
+      "learning_rate": 5.140716725964618e-05,
+      "loss": 0.2163,
+      "step": 1400
+    },
+    {
+      "epoch": 1.4805057955742886,
+      "grad_norm": 0.4140625,
+      "learning_rate": 5.1239749227464393e-05,
+      "loss": 0.2264,
+      "step": 1405
+    },
+    {
+      "epoch": 1.4857744994731297,
+      "grad_norm": 0.416015625,
+      "learning_rate": 5.107229251035056e-05,
+      "loss": 0.2168,
+      "step": 1410
+    },
+    {
+      "epoch": 1.4910432033719705,
+      "grad_norm": 0.4140625,
+      "learning_rate": 5.090480233359667e-05,
+      "loss": 0.2221,
+      "step": 1415
+    },
+    {
+      "epoch": 1.4963119072708113,
+      "grad_norm": 0.435546875,
+      "learning_rate": 5.07372839235388e-05,
+      "loss": 0.2257,
+      "step": 1420
+    },
+    {
+      "epoch": 1.5015806111696524,
+      "grad_norm": 0.443359375,
+      "learning_rate": 5.056974250739401e-05,
+      "loss": 0.2241,
+      "step": 1425
+    },
+    {
+      "epoch": 1.5068493150684932,
+      "grad_norm": 0.451171875,
+      "learning_rate": 5.0402183313097235e-05,
+      "loss": 0.2253,
+      "step": 1430
+    },
+    {
+      "epoch": 1.512118018967334,
+      "grad_norm": 0.4375,
+      "learning_rate": 5.023461156913818e-05,
+      "loss": 0.2195,
+      "step": 1435
+    },
+    {
+      "epoch": 1.517386722866175,
+      "grad_norm": 0.43359375,
+      "learning_rate": 5.0067032504398086e-05,
+      "loss": 0.2172,
+      "step": 1440
+    },
+    {
+      "epoch": 1.5226554267650156,
+      "grad_norm": 0.423828125,
+      "learning_rate": 4.98994513479867e-05,
+      "loss": 0.2243,
+      "step": 1445
+    },
+    {
+      "epoch": 1.5279241306638567,
+      "grad_norm": 0.423828125,
+      "learning_rate": 4.9731873329079e-05,
+      "loss": 0.224,
+      "step": 1450
+    },
+    {
+      "epoch": 1.5331928345626977,
+      "grad_norm": 0.404296875,
+      "learning_rate": 4.9564303676752075e-05,
+      "loss": 0.2187,
+      "step": 1455
+    },
+    {
+      "epoch": 1.5384615384615383,
+      "grad_norm": 0.44140625,
+      "learning_rate": 4.9396747619821925e-05,
+      "loss": 0.2247,
+      "step": 1460
+    },
+    {
+      "epoch": 1.5437302423603794,
+      "grad_norm": 0.451171875,
+      "learning_rate": 4.922921038668035e-05,
+      "loss": 0.2195,
+      "step": 1465
+    },
+    {
+      "epoch": 1.5489989462592202,
+      "grad_norm": 0.41796875,
+      "learning_rate": 4.906169720513176e-05,
+      "loss": 0.222,
+      "step": 1470
+    },
+    {
+      "epoch": 1.554267650158061,
+      "grad_norm": 0.431640625,
+      "learning_rate": 4.8894213302230055e-05,
+      "loss": 0.2275,
+      "step": 1475
+    },
+    {
+      "epoch": 1.559536354056902,
+      "grad_norm": 0.421875,
+      "learning_rate": 4.8726763904115556e-05,
+      "loss": 0.2193,
+      "step": 1480
+    },
+    {
+      "epoch": 1.5648050579557429,
+      "grad_norm": 0.44140625,
+      "learning_rate": 4.8559354235851854e-05,
+      "loss": 0.2289,
+      "step": 1485
+    },
+    {
+      "epoch": 1.5700737618545837,
+      "grad_norm": 0.41796875,
+      "learning_rate": 4.839198952126288e-05,
+      "loss": 0.2217,
+      "step": 1490
+    },
+    {
+      "epoch": 1.5753424657534247,
+      "grad_norm": 0.435546875,
+      "learning_rate": 4.8224674982769796e-05,
+      "loss": 0.2146,
+      "step": 1495
+    },
+    {
+      "epoch": 1.5806111696522656,
+      "grad_norm": 0.421875,
+      "learning_rate": 4.805741584122808e-05,
+      "loss": 0.2147,
+      "step": 1500
+    },
+    {
+      "epoch": 1.5806111696522656,
+      "eval_loss": 0.29037579894065857,
+      "eval_runtime": 2.4367,
+      "eval_samples_per_second": 19.289,
+      "eval_steps_per_second": 19.289,
+      "step": 1500
+    },
+    {
+      "epoch": 1.5858798735511064,
+      "grad_norm": 0.435546875,
+      "learning_rate": 4.78902173157646e-05,
+      "loss": 0.2182,
+      "step": 1505
+    },
+    {
+      "epoch": 1.5911485774499474,
+      "grad_norm": 0.423828125,
+      "learning_rate": 4.772308462361483e-05,
+      "loss": 0.2202,
+      "step": 1510
+    },
+    {
+      "epoch": 1.5964172813487882,
+      "grad_norm": 0.4296875,
+      "learning_rate": 4.7556022979959925e-05,
+      "loss": 0.2248,
+      "step": 1515
+    },
+    {
+      "epoch": 1.601685985247629,
+      "grad_norm": 0.41796875,
+      "learning_rate": 4.7389037597764084e-05,
+      "loss": 0.2185,
+      "step": 1520
+    },
+    {
+      "epoch": 1.60695468914647,
+      "grad_norm": 0.412109375,
+      "learning_rate": 4.7222133687611846e-05,
+      "loss": 0.2143,
+      "step": 1525
+    },
+    {
+      "epoch": 1.6122233930453107,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.705531645754552e-05,
+      "loss": 0.228,
+      "step": 1530
+    },
+    {
+      "epoch": 1.6174920969441517,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.688859111290268e-05,
+      "loss": 0.2205,
+      "step": 1535
+    },
+    {
+      "epoch": 1.6227608008429928,
+      "grad_norm": 0.435546875,
+      "learning_rate": 4.672196285615367e-05,
+      "loss": 0.2179,
+      "step": 1540
+    },
+    {
+      "epoch": 1.6280295047418334,
+      "grad_norm": 0.421875,
+      "learning_rate": 4.655543688673936e-05,
+      "loss": 0.2252,
+      "step": 1545
+    },
+    {
+      "epoch": 1.6332982086406744,
+      "grad_norm": 0.419921875,
+      "learning_rate": 4.638901840090886e-05,
+      "loss": 0.2207,
+      "step": 1550
+    },
+    {
+      "epoch": 1.6385669125395153,
+      "grad_norm": 0.42578125,
+      "learning_rate": 4.6222712591557375e-05,
+      "loss": 0.2213,
+      "step": 1555
+    },
+    {
+      "epoch": 1.643835616438356,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.6056524648064163e-05,
+      "loss": 0.2202,
+      "step": 1560
+    },
+    {
+      "epoch": 1.6491043203371971,
+      "grad_norm": 0.419921875,
+      "learning_rate": 4.589045975613062e-05,
+      "loss": 0.2162,
+      "step": 1565
+    },
+    {
+      "epoch": 1.654373024236038,
+      "grad_norm": 0.412109375,
+      "learning_rate": 4.572452309761847e-05,
+      "loss": 0.2169,
+      "step": 1570
+    },
+    {
+      "epoch": 1.6596417281348788,
+      "grad_norm": 0.439453125,
+      "learning_rate": 4.555871985038805e-05,
+      "loss": 0.2237,
+      "step": 1575
+    },
+    {
+      "epoch": 1.6649104320337198,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.539305518813677e-05,
+      "loss": 0.2207,
+      "step": 1580
+    },
+    {
+      "epoch": 1.6701791359325606,
+      "grad_norm": 0.4140625,
+      "learning_rate": 4.5227534280237645e-05,
+      "loss": 0.2167,
+      "step": 1585
+    },
+    {
+      "epoch": 1.6754478398314014,
+      "grad_norm": 0.423828125,
+      "learning_rate": 4.506216229157797e-05,
+      "loss": 0.2191,
+      "step": 1590
+    },
+    {
+      "epoch": 1.6807165437302425,
+      "grad_norm": 0.423828125,
+      "learning_rate": 4.489694438239827e-05,
+      "loss": 0.2171,
+      "step": 1595
+    },
+    {
+      "epoch": 1.685985247629083,
+      "grad_norm": 0.416015625,
+      "learning_rate": 4.4731885708131135e-05,
+      "loss": 0.2191,
+      "step": 1600
+    },
+    {
+      "epoch": 1.6912539515279241,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.456699141924041e-05,
+      "loss": 0.2195,
+      "step": 1605
+    },
+    {
+      "epoch": 1.6965226554267652,
+      "grad_norm": 0.419921875,
+      "learning_rate": 4.4402266661060535e-05,
+      "loss": 0.2168,
+      "step": 1610
+    },
+    {
+      "epoch": 1.7017913593256058,
+      "grad_norm": 0.42578125,
+      "learning_rate": 4.4237716573635895e-05,
+      "loss": 0.22,
+      "step": 1615
+    },
+    {
+      "epoch": 1.7070600632244468,
+      "grad_norm": 0.4140625,
+      "learning_rate": 4.40733462915605e-05,
+      "loss": 0.2103,
+      "step": 1620
+    },
+    {
+      "epoch": 1.7123287671232876,
+      "grad_norm": 0.453125,
+      "learning_rate": 4.390916094381774e-05,
+      "loss": 0.2154,
+      "step": 1625
+    },
+    {
+      "epoch": 1.7175974710221285,
+      "grad_norm": 0.45703125,
+      "learning_rate": 4.374516565362034e-05,
+      "loss": 0.223,
+      "step": 1630
+    },
+    {
+      "epoch": 1.7228661749209695,
+      "grad_norm": 0.419921875,
+      "learning_rate": 4.35813655382505e-05,
+      "loss": 0.2193,
+      "step": 1635
+    },
+    {
+      "epoch": 1.7281348788198103,
+      "grad_norm": 0.4453125,
+      "learning_rate": 4.341776570890024e-05,
+      "loss": 0.213,
+      "step": 1640
+    },
+    {
+      "epoch": 1.7334035827186511,
+      "grad_norm": 0.41796875,
+      "learning_rate": 4.325437127051184e-05,
+      "loss": 0.219,
+      "step": 1645
+    },
+    {
+      "epoch": 1.7386722866174922,
+      "grad_norm": 0.419921875,
+      "learning_rate": 4.309118732161865e-05,
+      "loss": 0.2188,
+      "step": 1650
+    },
+    {
+      "epoch": 1.743940990516333,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.292821895418588e-05,
+      "loss": 0.2181,
+      "step": 1655
+    },
+    {
+      "epoch": 1.7492096944151738,
+      "grad_norm": 0.42578125,
+      "learning_rate": 4.2765471253451824e-05,
+      "loss": 0.2168,
+      "step": 1660
+    },
+    {
+      "epoch": 1.7544783983140149,
+      "grad_norm": 0.4296875,
+      "learning_rate": 4.260294929776911e-05,
+      "loss": 0.2231,
+      "step": 1665
+    },
+    {
+      "epoch": 1.7597471022128557,
+      "grad_norm": 0.41015625,
+      "learning_rate": 4.244065815844624e-05,
+      "loss": 0.2084,
+      "step": 1670
+    },
+    {
+      "epoch": 1.7650158061116965,
+      "grad_norm": 0.421875,
+      "learning_rate": 4.227860289958938e-05,
+      "loss": 0.2095,
+      "step": 1675
+    },
+    {
+      "epoch": 1.7702845100105375,
+      "grad_norm": 0.431640625,
+      "learning_rate": 4.211678857794432e-05,
+      "loss": 0.2118,
+      "step": 1680
+    },
+    {
+      "epoch": 1.7755532139093781,
+      "grad_norm": 0.44140625,
+      "learning_rate": 4.1955220242738666e-05,
+      "loss": 0.2131,
+      "step": 1685
+    },
+    {
+      "epoch": 1.7808219178082192,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.1793902935524314e-05,
+      "loss": 0.2141,
+      "step": 1690
+    },
+    {
+      "epoch": 1.78609062170706,
+      "grad_norm": 0.421875,
+      "learning_rate": 4.163284169002013e-05,
+      "loss": 0.2102,
+      "step": 1695
+    },
+    {
+      "epoch": 1.7913593256059008,
+      "grad_norm": 0.435546875,
+      "learning_rate": 4.147204153195486e-05,
+      "loss": 0.2179,
+      "step": 1700
+    },
+    {
+      "epoch": 1.7966280295047419,
+      "grad_norm": 0.435546875,
+      "learning_rate": 4.1311507478910346e-05,
+      "loss": 0.2133,
+      "step": 1705
+    },
+    {
+      "epoch": 1.8018967334035827,
+      "grad_norm": 0.453125,
+      "learning_rate": 4.11512445401649e-05,
+      "loss": 0.2259,
+      "step": 1710
+    },
+    {
+      "epoch": 1.8071654373024235,
+      "grad_norm": 0.421875,
+      "learning_rate": 4.099125771653703e-05,
+      "loss": 0.2247,
+      "step": 1715
+    },
+    {
+      "epoch": 1.8124341412012646,
+      "grad_norm": 0.423828125,
+      "learning_rate": 4.083155200022942e-05,
+      "loss": 0.2104,
+      "step": 1720
+    },
+    {
+      "epoch": 1.8177028451001054,
+      "grad_norm": 0.416015625,
+      "learning_rate": 4.067213237467312e-05,
+      "loss": 0.2152,
+      "step": 1725
+    },
+    {
+      "epoch": 1.8229715489989462,
+      "grad_norm": 0.431640625,
+      "learning_rate": 4.051300381437201e-05,
+      "loss": 0.2213,
+      "step": 1730
+    },
+    {
+      "epoch": 1.8282402528977872,
+      "grad_norm": 0.408203125,
+      "learning_rate": 4.035417128474769e-05,
+      "loss": 0.2156,
+      "step": 1735
+    },
+    {
+      "epoch": 1.833508956796628,
+      "grad_norm": 0.423828125,
+      "learning_rate": 4.019563974198439e-05,
+      "loss": 0.2119,
+      "step": 1740
+    },
+    {
+      "epoch": 1.8387776606954689,
+      "grad_norm": 0.419921875,
+      "learning_rate": 4.0037414132874454e-05,
+      "loss": 0.2149,
+      "step": 1745
+    },
+    {
+      "epoch": 1.84404636459431,
+      "grad_norm": 0.404296875,
+      "learning_rate": 3.98794993946639e-05,
+      "loss": 0.207,
+      "step": 1750
+    },
+    {
+      "epoch": 1.8493150684931505,
+      "grad_norm": 0.423828125,
+      "learning_rate": 3.972190045489838e-05,
+      "loss": 0.207,
+      "step": 1755
+    },
+    {
+      "epoch": 1.8545837723919916,
+      "grad_norm": 0.412109375,
+      "learning_rate": 3.956462223126941e-05,
+      "loss": 0.2176,
+      "step": 1760
+    },
+    {
+      "epoch": 1.8598524762908326,
+      "grad_norm": 0.419921875,
+      "learning_rate": 3.940766963146097e-05,
+      "loss": 0.2195,
+      "step": 1765
+    },
+    {
+      "epoch": 1.8651211801896732,
+      "grad_norm": 0.419921875,
+      "learning_rate": 3.9251047552996304e-05,
+      "loss": 0.2203,
+      "step": 1770
+    },
+    {
+      "epoch": 1.8703898840885143,
+      "grad_norm": 0.43359375,
+      "learning_rate": 3.9094760883085096e-05,
+      "loss": 0.2217,
+      "step": 1775
+    },
+    {
+      "epoch": 1.875658587987355,
+      "grad_norm": 0.404296875,
+      "learning_rate": 3.8938814498471055e-05,
+      "loss": 0.2036,
+      "step": 1780
+    },
+    {
+      "epoch": 1.880927291886196,
+      "grad_norm": 0.41796875,
+      "learning_rate": 3.8783213265279634e-05,
+      "loss": 0.2165,
+      "step": 1785
+    },
+    {
+      "epoch": 1.886195995785037,
+      "grad_norm": 0.41796875,
+      "learning_rate": 3.8627962038866255e-05,
+      "loss": 0.2108,
+      "step": 1790
+    },
+    {
+      "epoch": 1.8914646996838778,
+      "grad_norm": 0.43359375,
+      "learning_rate": 3.84730656636648e-05,
+      "loss": 0.2106,
+      "step": 1795
+    },
+    {
+      "epoch": 1.8967334035827186,
+      "grad_norm": 0.41796875,
+      "learning_rate": 3.8318528973036395e-05,
+      "loss": 0.2144,
+      "step": 1800
+    },
+    {
+      "epoch": 1.9020021074815596,
+      "grad_norm": 0.41796875,
+      "learning_rate": 3.816435678911868e-05,
+      "loss": 0.2099,
+      "step": 1805
+    },
+    {
+      "epoch": 1.9072708113804004,
+      "grad_norm": 0.408203125,
+      "learning_rate": 3.801055392267523e-05,
+      "loss": 0.2168,
+      "step": 1810
+    },
+    {
+      "epoch": 1.9125395152792413,
+      "grad_norm": 0.408203125,
+      "learning_rate": 3.785712517294552e-05,
+      "loss": 0.2062,
+      "step": 1815
+    },
+    {
+      "epoch": 1.9178082191780823,
+      "grad_norm": 0.439453125,
+      "learning_rate": 3.770407532749519e-05,
+      "loss": 0.2215,
+      "step": 1820
+    },
+    {
+      "epoch": 1.9230769230769231,
+      "grad_norm": 0.416015625,
+      "learning_rate": 3.755140916206654e-05,
+      "loss": 0.2111,
+      "step": 1825
+    },
+    {
+      "epoch": 1.928345626975764,
+      "grad_norm": 0.421875,
+      "learning_rate": 3.739913144042963e-05,
+      "loss": 0.2109,
+      "step": 1830
+    },
+    {
+      "epoch": 1.933614330874605,
+      "grad_norm": 0.416015625,
+      "learning_rate": 3.7247246914233584e-05,
+      "loss": 0.2133,
+      "step": 1835
+    },
+    {
+      "epoch": 1.9388830347734456,
+      "grad_norm": 0.42578125,
+      "learning_rate": 3.709576032285829e-05,
+      "loss": 0.2128,
+      "step": 1840
+    },
+    {
+      "epoch": 1.9441517386722866,
+      "grad_norm": 0.42578125,
+      "learning_rate": 3.694467639326656e-05,
+      "loss": 0.2161,
+      "step": 1845
+    },
+    {
+      "epoch": 1.9494204425711275,
+      "grad_norm": 0.43359375,
+      "learning_rate": 3.679399983985663e-05,
+      "loss": 0.2215,
+      "step": 1850
+    },
+    {
+      "epoch": 1.9546891464699683,
+      "grad_norm": 0.412109375,
+      "learning_rate": 3.6643735364314995e-05,
+      "loss": 0.2111,
+      "step": 1855
+    },
+    {
+      "epoch": 1.9599578503688093,
+      "grad_norm": 0.41015625,
+      "learning_rate": 3.6493887655469796e-05,
+      "loss": 0.2141,
+      "step": 1860
+    },
+    {
+      "epoch": 1.9652265542676501,
+      "grad_norm": 0.4375,
+      "learning_rate": 3.63444613891444e-05,
+      "loss": 0.2102,
+      "step": 1865
+    },
+    {
+      "epoch": 1.970495258166491,
+      "grad_norm": 0.412109375,
+      "learning_rate": 3.619546122801158e-05,
+      "loss": 0.204,
+      "step": 1870
+    },
+    {
+      "epoch": 1.975763962065332,
+      "grad_norm": 0.419921875,
+      "learning_rate": 3.604689182144798e-05,
+      "loss": 0.2057,
+      "step": 1875
+    },
+    {
+      "epoch": 1.9810326659641728,
+      "grad_norm": 0.42578125,
+      "learning_rate": 3.589875780538906e-05,
+      "loss": 0.2107,
+      "step": 1880
+    },
+    {
+      "epoch": 1.9863013698630136,
+      "grad_norm": 0.4375,
+      "learning_rate": 3.575106380218442e-05,
+      "loss": 0.2202,
+      "step": 1885
+    },
+    {
+      "epoch": 1.9915700737618547,
+      "grad_norm": 0.4140625,
+      "learning_rate": 3.5603814420453566e-05,
+      "loss": 0.2075,
+      "step": 1890
+    },
+    {
+      "epoch": 1.9968387776606955,
+      "grad_norm": 0.416015625,
+      "learning_rate": 3.5457014254942126e-05,
+      "loss": 0.2163,
+      "step": 1895
+    },
+    {
+      "epoch": 1.9978925184404637,
+      "eval_loss": 0.28488120436668396,
+      "eval_runtime": 2.5242,
+      "eval_samples_per_second": 18.619,
+      "eval_steps_per_second": 18.619,
+      "step": 1896
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2847,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.051871750229852e+18,
+  "train_batch_size": 100,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63dcfa6fe661e6522819316451dcbd41131cc208eb88ad00e77b83a795a4ccfe
+size 5624

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff