Training in progress, step 150, checkpoint

Browse files

Files changed (8) hide show

last-checkpoint/adapter_config.json +3 -5
last-checkpoint/adapter_model.safetensors +2 -2
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/tokenizer.json +2 -2
last-checkpoint/trainer_state.json +54 -1862
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -14,17 +14,15 @@
   "lora_dropout": 0.1,
   "megatron_config": null,
   "megatron_core": "megatron.core",
-  "modules_to_save": [
-    "lm_head"
-  ],
   "peft_type": "LORA",
   "r": 8,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
-    "o_proj",
     "k_proj",
     "v_proj"
   ],
   "task_type": "CAUSAL_LM",

   "lora_dropout": 0.1,
   "megatron_config": null,
   "megatron_core": "megatron.core",
+  "modules_to_save": [],
   "peft_type": "LORA",
   "r": 8,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "k_proj",
+    "o_proj",
+    "q_proj",
     "v_proj"
   ],
   "task_type": "CAUSAL_LM",

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e70898adc0001cf8d3cbd88db122187cf34c72c60ab6b0d25f03fe79664bfbe6
-size 532169208

 version https://git-lfs.github.com/spec/v1
+oid sha256:58a6dac7b8788cdf686e0493d47f426ab4e6370365936cf0127fc02865db7e27
+size 6832520

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ba56b0e54f1167dc39906ab61863f6f7efa026a9bdc2767ad4e86e543d464ac2
-size 1064413498

 version https://git-lfs.github.com/spec/v1
+oid sha256:7192f33aec7dae1d9eb5f8b448f192cc1be6d1e1d1fce1464dfc671d4cd435f0
+size 13739450

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a0e8cd75e9fb00d7e1c3e742ef94f1b5f25795aee0fd9312b8f905404776c3bb
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:ba79f79f4200644bcde298b8ba358af98910b10cc152e720addca023a2e47a37
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2608ab80398bcdaa9fbf626bf9fbe36aaf04727adaa3a55c222330d8a358f2f7
 size 1256

 version https://git-lfs.github.com/spec/v1
+oid sha256:6186f1e9836e20fd270e1ab773c83f1027d92e426fd1d0a8c7816f8a9115c5fd
 size 1256

last-checkpoint/tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
-size 17209920

 version https://git-lfs.github.com/spec/v1
+oid sha256:52716f60c3ad328509fa37cdded9a2f1196ecae463f5480f5d38c66a25e7a7dc
+size 17210019

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,1933 +1,125 @@
 {
-  "best_metric": 1.9544912576675415,
-  "best_model_checkpoint": "./output/checkpoint-1500",
-  "epoch": 3.256704980842912,
   "eval_steps": 150,
-  "global_step": 2550,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.01277139208173691,
-      "grad_norm": 3.200807571411133,
-      "learning_rate": 1.4e-06,
-      "loss": 1.9026,
       "step": 10
     },
     {
       "epoch": 0.02554278416347382,
-      "grad_norm": 3.294923782348633,
-      "learning_rate": 2.8e-06,
-      "loss": 1.9388,
       "step": 20
     },
     {
       "epoch": 0.038314176245210725,
-      "grad_norm": 3.206088066101074,
-      "learning_rate": 4.2e-06,
-      "loss": 1.9022,
       "step": 30
     },
     {
       "epoch": 0.05108556832694764,
-      "grad_norm": 3.0478761196136475,
-      "learning_rate": 5.6e-06,
-      "loss": 1.8711,
       "step": 40
     },
     {
       "epoch": 0.06385696040868455,
-      "grad_norm": 3.636721611022949,
-      "learning_rate": 7e-06,
-      "loss": 1.9732,
       "step": 50
     },
     {
       "epoch": 0.07662835249042145,
-      "grad_norm": 3.1977827548980713,
-      "learning_rate": 8.4e-06,
-      "loss": 1.9163,
       "step": 60
     },
     {
       "epoch": 0.08939974457215837,
-      "grad_norm": 2.9971323013305664,
-      "learning_rate": 9.8e-06,
-      "loss": 1.9435,
       "step": 70
     },
     {
       "epoch": 0.10217113665389528,
-      "grad_norm": 3.2469263076782227,
-      "learning_rate": 1.12e-05,
-      "loss": 1.9411,
       "step": 80
     },
     {
       "epoch": 0.11494252873563218,
-      "grad_norm": 3.1737616062164307,
-      "learning_rate": 1.2599999999999998e-05,
-      "loss": 1.9255,
       "step": 90
     },
     {
       "epoch": 0.1277139208173691,
-      "grad_norm": 3.41223406791687,
-      "learning_rate": 1.4e-05,
-      "loss": 1.9807,
       "step": 100
     },
     {
       "epoch": 0.140485312899106,
-      "grad_norm": 2.883166551589966,
-      "learning_rate": 1.5399999999999998e-05,
-      "loss": 1.9229,
       "step": 110
     },
     {
       "epoch": 0.1532567049808429,
-      "grad_norm": 2.956953287124634,
-      "learning_rate": 1.68e-05,
-      "loss": 1.9432,
       "step": 120
     },
     {
       "epoch": 0.16602809706257982,
-      "grad_norm": 3.3282694816589355,
-      "learning_rate": 1.82e-05,
-      "loss": 1.9258,
       "step": 130
     },
     {
       "epoch": 0.17879948914431673,
-      "grad_norm": 3.1125221252441406,
-      "learning_rate": 1.96e-05,
-      "loss": 1.9145,
       "step": 140
     },
     {
       "epoch": 0.19157088122605365,
-      "grad_norm": 3.033933639526367,
-      "learning_rate": 2.1e-05,
-      "loss": 1.9399,
       "step": 150
     },
     {
       "epoch": 0.19157088122605365,
-      "eval_loss": 1.9976754188537598,
-      "eval_runtime": 24.9012,
-      "eval_samples_per_second": 20.079,
-      "eval_steps_per_second": 20.079,
       "step": 150
-    },
-    {
-      "epoch": 0.20434227330779056,
-      "grad_norm": 3.3137006759643555,
-      "learning_rate": 2.24e-05,
-      "loss": 1.8722,
-      "step": 160
-    },
-    {
-      "epoch": 0.21711366538952745,
-      "grad_norm": 3.511589288711548,
-      "learning_rate": 2.38e-05,
-      "loss": 1.897,
-      "step": 170
-    },
-    {
-      "epoch": 0.22988505747126436,
-      "grad_norm": 3.4723284244537354,
-      "learning_rate": 2.5199999999999996e-05,
-      "loss": 1.8827,
-      "step": 180
-    },
-    {
-      "epoch": 0.24265644955300128,
-      "grad_norm": 3.296243667602539,
-      "learning_rate": 2.66e-05,
-      "loss": 1.9298,
-      "step": 190
-    },
-    {
-      "epoch": 0.2554278416347382,
-      "grad_norm": 3.188915491104126,
-      "learning_rate": 2.8e-05,
-      "loss": 1.9129,
-      "step": 200
-    },
-    {
-      "epoch": 0.2681992337164751,
-      "grad_norm": 3.2900846004486084,
-      "learning_rate": 2.9399999999999996e-05,
-      "loss": 1.8282,
-      "step": 210
-    },
-    {
-      "epoch": 0.280970625798212,
-      "grad_norm": 3.3161473274230957,
-      "learning_rate": 3.0799999999999996e-05,
-      "loss": 1.9102,
-      "step": 220
-    },
-    {
-      "epoch": 0.2937420178799489,
-      "grad_norm": 3.259005546569824,
-      "learning_rate": 3.22e-05,
-      "loss": 1.948,
-      "step": 230
-    },
-    {
-      "epoch": 0.3065134099616858,
-      "grad_norm": 3.051255941390991,
-      "learning_rate": 3.36e-05,
-      "loss": 1.8954,
-      "step": 240
-    },
-    {
-      "epoch": 0.31928480204342274,
-      "grad_norm": 3.3548150062561035,
-      "learning_rate": 3.5e-05,
-      "loss": 1.9701,
-      "step": 250
-    },
-    {
-      "epoch": 0.33205619412515963,
-      "grad_norm": 2.9780640602111816,
-      "learning_rate": 3.64e-05,
-      "loss": 1.908,
-      "step": 260
-    },
-    {
-      "epoch": 0.3448275862068966,
-      "grad_norm": 3.386885404586792,
-      "learning_rate": 3.78e-05,
-      "loss": 1.9511,
-      "step": 270
-    },
-    {
-      "epoch": 0.35759897828863346,
-      "grad_norm": 3.263582229614258,
-      "learning_rate": 3.92e-05,
-      "loss": 1.9289,
-      "step": 280
-    },
-    {
-      "epoch": 0.37037037037037035,
-      "grad_norm": 3.1607635021209717,
-      "learning_rate": 4.059999999999999e-05,
-      "loss": 1.9594,
-      "step": 290
-    },
-    {
-      "epoch": 0.3831417624521073,
-      "grad_norm": 3.176176071166992,
-      "learning_rate": 4.2e-05,
-      "loss": 1.9399,
-      "step": 300
-    },
-    {
-      "epoch": 0.3831417624521073,
-      "eval_loss": 1.9805161952972412,
-      "eval_runtime": 28.0922,
-      "eval_samples_per_second": 17.799,
-      "eval_steps_per_second": 17.799,
-      "step": 300
-    },
-    {
-      "epoch": 0.3959131545338442,
-      "grad_norm": 3.4425015449523926,
-      "learning_rate": 4.34e-05,
-      "loss": 1.8902,
-      "step": 310
-    },
-    {
-      "epoch": 0.4086845466155811,
-      "grad_norm": 3.13578724861145,
-      "learning_rate": 4.48e-05,
-      "loss": 1.9275,
-      "step": 320
-    },
-    {
-      "epoch": 0.421455938697318,
-      "grad_norm": 3.6584582328796387,
-      "learning_rate": 4.62e-05,
-      "loss": 1.9625,
-      "step": 330
-    },
-    {
-      "epoch": 0.4342273307790549,
-      "grad_norm": 3.0934386253356934,
-      "learning_rate": 4.76e-05,
-      "loss": 1.882,
-      "step": 340
-    },
-    {
-      "epoch": 0.44699872286079184,
-      "grad_norm": 3.1788082122802734,
-      "learning_rate": 4.899999999999999e-05,
-      "loss": 1.8866,
-      "step": 350
-    },
-    {
-      "epoch": 0.45977011494252873,
-      "grad_norm": 3.1974220275878906,
-      "learning_rate": 5.039999999999999e-05,
-      "loss": 1.9472,
-      "step": 360
-    },
-    {
-      "epoch": 0.4725415070242657,
-      "grad_norm": 3.3801984786987305,
-      "learning_rate": 5.179999999999999e-05,
-      "loss": 1.9249,
-      "step": 370
-    },
-    {
-      "epoch": 0.48531289910600256,
-      "grad_norm": 3.156172037124634,
-      "learning_rate": 5.32e-05,
-      "loss": 1.9025,
-      "step": 380
-    },
-    {
-      "epoch": 0.49808429118773945,
-      "grad_norm": 3.3196117877960205,
-      "learning_rate": 5.46e-05,
-      "loss": 1.8319,
-      "step": 390
-    },
-    {
-      "epoch": 0.5108556832694764,
-      "grad_norm": 3.410414457321167,
-      "learning_rate": 5.6e-05,
-      "loss": 1.9329,
-      "step": 400
-    },
-    {
-      "epoch": 0.5236270753512133,
-      "grad_norm": 3.050872325897217,
-      "learning_rate": 5.739999999999999e-05,
-      "loss": 1.936,
-      "step": 410
-    },
-    {
-      "epoch": 0.5363984674329502,
-      "grad_norm": 3.5527184009552,
-      "learning_rate": 5.879999999999999e-05,
-      "loss": 1.9004,
-      "step": 420
-    },
-    {
-      "epoch": 0.5491698595146871,
-      "grad_norm": 3.099611282348633,
-      "learning_rate": 6.019999999999999e-05,
-      "loss": 1.9061,
-      "step": 430
-    },
-    {
-      "epoch": 0.561941251596424,
-      "grad_norm": 3.0915510654449463,
-      "learning_rate": 6.159999999999999e-05,
-      "loss": 1.9015,
-      "step": 440
-    },
-    {
-      "epoch": 0.5747126436781609,
-      "grad_norm": 3.2416725158691406,
-      "learning_rate": 6.3e-05,
-      "loss": 1.8938,
-      "step": 450
-    },
-    {
-      "epoch": 0.5747126436781609,
-      "eval_loss": 1.970125675201416,
-      "eval_runtime": 24.7662,
-      "eval_samples_per_second": 20.189,
-      "eval_steps_per_second": 20.189,
-      "step": 450
-    },
-    {
-      "epoch": 0.5874840357598978,
-      "grad_norm": 2.9722657203674316,
-      "learning_rate": 6.44e-05,
-      "loss": 1.869,
-      "step": 460
-    },
-    {
-      "epoch": 0.6002554278416348,
-      "grad_norm": 3.058877468109131,
-      "learning_rate": 6.579999999999999e-05,
-      "loss": 1.856,
-      "step": 470
-    },
-    {
-      "epoch": 0.6130268199233716,
-      "grad_norm": 3.3449816703796387,
-      "learning_rate": 6.72e-05,
-      "loss": 1.8573,
-      "step": 480
-    },
-    {
-      "epoch": 0.6257982120051085,
-      "grad_norm": 3.0998528003692627,
-      "learning_rate": 6.859999999999999e-05,
-      "loss": 1.8721,
-      "step": 490
-    },
-    {
-      "epoch": 0.6385696040868455,
-      "grad_norm": 3.2949531078338623,
-      "learning_rate": 7e-05,
-      "loss": 1.9653,
-      "step": 500
-    },
-    {
-      "epoch": 0.6513409961685823,
-      "grad_norm": 2.965726137161255,
-      "learning_rate": 6.99991470746888e-05,
-      "loss": 1.8915,
-      "step": 510
-    },
-    {
-      "epoch": 0.6641123882503193,
-      "grad_norm": 3.32828950881958,
-      "learning_rate": 6.999658834032565e-05,
-      "loss": 1.8694,
-      "step": 520
-    },
-    {
-      "epoch": 0.6768837803320562,
-      "grad_norm": 3.0084595680236816,
-      "learning_rate": 6.999232392161959e-05,
-      "loss": 1.8969,
-      "step": 530
-    },
-    {
-      "epoch": 0.6896551724137931,
-      "grad_norm": 3.276498556137085,
-      "learning_rate": 6.99863540264124e-05,
-      "loss": 1.9219,
-      "step": 540
-    },
-    {
-      "epoch": 0.70242656449553,
-      "grad_norm": 3.205116033554077,
-      "learning_rate": 6.997867894566835e-05,
-      "loss": 1.9566,
-      "step": 550
-    },
-    {
-      "epoch": 0.7151979565772669,
-      "grad_norm": 3.1132071018218994,
-      "learning_rate": 6.996929905346004e-05,
-      "loss": 1.9083,
-      "step": 560
-    },
-    {
-      "epoch": 0.7279693486590039,
-      "grad_norm": 2.9555888175964355,
-      "learning_rate": 6.995821480695019e-05,
-      "loss": 1.8563,
-      "step": 570
-    },
-    {
-      "epoch": 0.7407407407407407,
-      "grad_norm": 3.2956736087799072,
-      "learning_rate": 6.994542674636933e-05,
-      "loss": 1.8805,
-      "step": 580
-    },
-    {
-      "epoch": 0.7535121328224776,
-      "grad_norm": 3.101565361022949,
-      "learning_rate": 6.99309354949895e-05,
-      "loss": 1.8884,
-      "step": 590
-    },
-    {
-      "epoch": 0.7662835249042146,
-      "grad_norm": 3.5506794452667236,
-      "learning_rate": 6.991474175909384e-05,
-      "loss": 1.9324,
-      "step": 600
-    },
-    {
-      "epoch": 0.7662835249042146,
-      "eval_loss": 1.9721698760986328,
-      "eval_runtime": 27.9596,
-      "eval_samples_per_second": 17.883,
-      "eval_steps_per_second": 17.883,
-      "step": 600
-    },
-    {
-      "epoch": 0.7790549169859514,
-      "grad_norm": 3.1406140327453613,
-      "learning_rate": 6.989684632794221e-05,
-      "loss": 1.8976,
-      "step": 610
-    },
-    {
-      "epoch": 0.7918263090676884,
-      "grad_norm": 3.364961624145508,
-      "learning_rate": 6.987725007373265e-05,
-      "loss": 1.8477,
-      "step": 620
-    },
-    {
-      "epoch": 0.8045977011494253,
-      "grad_norm": 2.9893288612365723,
-      "learning_rate": 6.985595395155896e-05,
-      "loss": 1.9148,
-      "step": 630
-    },
-    {
-      "epoch": 0.8173690932311622,
-      "grad_norm": 3.107212781906128,
-      "learning_rate": 6.983295899936408e-05,
-      "loss": 1.8768,
-      "step": 640
-    },
-    {
-      "epoch": 0.8301404853128991,
-      "grad_norm": 3.053358554840088,
-      "learning_rate": 6.980826633788956e-05,
-      "loss": 1.9073,
-      "step": 650
-    },
-    {
-      "epoch": 0.842911877394636,
-      "grad_norm": 2.9061360359191895,
-      "learning_rate": 6.978187717062086e-05,
-      "loss": 1.8477,
-      "step": 660
-    },
-    {
-      "epoch": 0.855683269476373,
-      "grad_norm": 2.9826648235321045,
-      "learning_rate": 6.975379278372878e-05,
-      "loss": 1.8508,
-      "step": 670
-    },
-    {
-      "epoch": 0.8684546615581098,
-      "grad_norm": 3.2406656742095947,
-      "learning_rate": 6.972401454600672e-05,
-      "loss": 1.9066,
-      "step": 680
-    },
-    {
-      "epoch": 0.8812260536398467,
-      "grad_norm": 3.206275463104248,
-      "learning_rate": 6.969254390880395e-05,
-      "loss": 1.9108,
-      "step": 690
-    },
-    {
-      "epoch": 0.8939974457215837,
-      "grad_norm": 3.2281596660614014,
-      "learning_rate": 6.965938240595496e-05,
-      "loss": 1.9246,
-      "step": 700
-    },
-    {
-      "epoch": 0.9067688378033205,
-      "grad_norm": 3.3281443119049072,
-      "learning_rate": 6.962453165370459e-05,
-      "loss": 1.9756,
-      "step": 710
-    },
-    {
-      "epoch": 0.9195402298850575,
-      "grad_norm": 3.3063316345214844,
-      "learning_rate": 6.958799335062934e-05,
-      "loss": 1.8813,
-      "step": 720
-    },
-    {
-      "epoch": 0.9323116219667944,
-      "grad_norm": 2.862231969833374,
-      "learning_rate": 6.954976927755458e-05,
-      "loss": 1.8586,
-      "step": 730
-    },
-    {
-      "epoch": 0.9450830140485313,
-      "grad_norm": 3.3289144039154053,
-      "learning_rate": 6.950986129746767e-05,
-      "loss": 1.8932,
-      "step": 740
-    },
-    {
-      "epoch": 0.9578544061302682,
-      "grad_norm": 2.8503053188323975,
-      "learning_rate": 6.946827135542728e-05,
-      "loss": 1.8778,
-      "step": 750
-    },
-    {
-      "epoch": 0.9578544061302682,
-      "eval_loss": 1.9627635478973389,
-      "eval_runtime": 28.7652,
-      "eval_samples_per_second": 17.382,
-      "eval_steps_per_second": 17.382,
-      "step": 750
-    },
-    {
-      "epoch": 0.9706257982120051,
-      "grad_norm": 3.0852253437042236,
-      "learning_rate": 6.94250014784685e-05,
-      "loss": 1.9039,
-      "step": 760
-    },
-    {
-      "epoch": 0.9833971902937421,
-      "grad_norm": 3.4453940391540527,
-      "learning_rate": 6.93800537755041e-05,
-      "loss": 1.8881,
-      "step": 770
-    },
-    {
-      "epoch": 0.9961685823754789,
-      "grad_norm": 2.987304925918579,
-      "learning_rate": 6.93334304372217e-05,
-      "loss": 1.8828,
-      "step": 780
-    },
-    {
-      "epoch": 1.0089399744572158,
-      "grad_norm": 2.9873013496398926,
-      "learning_rate": 6.928513373597703e-05,
-      "loss": 1.8152,
-      "step": 790
-    },
-    {
-      "epoch": 1.0217113665389528,
-      "grad_norm": 3.0838310718536377,
-      "learning_rate": 6.923516602568319e-05,
-      "loss": 1.7415,
-      "step": 800
-    },
-    {
-      "epoch": 1.0344827586206897,
-      "grad_norm": 3.1033427715301514,
-      "learning_rate": 6.918352974169587e-05,
-      "loss": 1.7952,
-      "step": 810
-    },
-    {
-      "epoch": 1.0472541507024267,
-      "grad_norm": 3.1028897762298584,
-      "learning_rate": 6.913022740069471e-05,
-      "loss": 1.7518,
-      "step": 820
-    },
-    {
-      "epoch": 1.0600255427841634,
-      "grad_norm": 3.008892297744751,
-      "learning_rate": 6.90752616005606e-05,
-      "loss": 1.7723,
-      "step": 830
-    },
-    {
-      "epoch": 1.0727969348659003,
-      "grad_norm": 3.101642370223999,
-      "learning_rate": 6.901863502024912e-05,
-      "loss": 1.7136,
-      "step": 840
-    },
-    {
-      "epoch": 1.0855683269476373,
-      "grad_norm": 3.0864851474761963,
-      "learning_rate": 6.896035041965987e-05,
-      "loss": 1.7585,
-      "step": 850
-    },
-    {
-      "epoch": 1.0983397190293742,
-      "grad_norm": 3.0635933876037598,
-      "learning_rate": 6.890041063950208e-05,
-      "loss": 1.7557,
-      "step": 860
-    },
-    {
-      "epoch": 1.1111111111111112,
-      "grad_norm": 3.0961105823516846,
-      "learning_rate": 6.883881860115608e-05,
-      "loss": 1.7491,
-      "step": 870
-    },
-    {
-      "epoch": 1.123882503192848,
-      "grad_norm": 3.00886607170105,
-      "learning_rate": 6.87755773065309e-05,
-      "loss": 1.7299,
-      "step": 880
-    },
-    {
-      "epoch": 1.136653895274585,
-      "grad_norm": 2.9253010749816895,
-      "learning_rate": 6.871068983791803e-05,
-      "loss": 1.7236,
-      "step": 890
-    },
-    {
-      "epoch": 1.1494252873563218,
-      "grad_norm": 3.2435286045074463,
-      "learning_rate": 6.864415935784116e-05,
-      "loss": 1.7262,
-      "step": 900
-    },
-    {
-      "epoch": 1.1494252873563218,
-      "eval_loss": 1.9637728929519653,
-      "eval_runtime": 26.2765,
-      "eval_samples_per_second": 19.028,
-      "eval_steps_per_second": 19.028,
-      "step": 900
-    },
-    {
-      "epoch": 1.1621966794380587,
-      "grad_norm": 2.878432512283325,
-      "learning_rate": 6.8575989108902e-05,
-      "loss": 1.7455,
-      "step": 910
-    },
-    {
-      "epoch": 1.1749680715197957,
-      "grad_norm": 2.9958291053771973,
-      "learning_rate": 6.850618241362235e-05,
-      "loss": 1.7218,
-      "step": 920
-    },
-    {
-      "epoch": 1.1877394636015326,
-      "grad_norm": 3.107431173324585,
-      "learning_rate": 6.843474267428202e-05,
-      "loss": 1.7237,
-      "step": 930
-    },
-    {
-      "epoch": 1.2005108556832695,
-      "grad_norm": 3.1416854858398438,
-      "learning_rate": 6.836167337275314e-05,
-      "loss": 1.7539,
-      "step": 940
-    },
-    {
-      "epoch": 1.2132822477650063,
-      "grad_norm": 3.1158761978149414,
-      "learning_rate": 6.828697807033038e-05,
-      "loss": 1.7637,
-      "step": 950
-    },
-    {
-      "epoch": 1.2260536398467432,
-      "grad_norm": 2.7915549278259277,
-      "learning_rate": 6.821066040755737e-05,
-      "loss": 1.688,
-      "step": 960
-    },
-    {
-      "epoch": 1.2388250319284801,
-      "grad_norm": 3.083923816680908,
-      "learning_rate": 6.813272410404936e-05,
-      "loss": 1.7765,
-      "step": 970
-    },
-    {
-      "epoch": 1.251596424010217,
-      "grad_norm": 3.111654281616211,
-      "learning_rate": 6.805317295831182e-05,
-      "loss": 1.7157,
-      "step": 980
-    },
-    {
-      "epoch": 1.264367816091954,
-      "grad_norm": 2.9396989345550537,
-      "learning_rate": 6.797201084755538e-05,
-      "loss": 1.7147,
-      "step": 990
-    },
-    {
-      "epoch": 1.277139208173691,
-      "grad_norm": 3.1540298461914062,
-      "learning_rate": 6.788924172750679e-05,
-      "loss": 1.755,
-      "step": 1000
-    },
-    {
-      "epoch": 1.289910600255428,
-      "grad_norm": 3.1720669269561768,
-      "learning_rate": 6.78048696322162e-05,
-      "loss": 1.8097,
-      "step": 1010
-    },
-    {
-      "epoch": 1.3026819923371646,
-      "grad_norm": 3.1514675617218018,
-      "learning_rate": 6.77188986738605e-05,
-      "loss": 1.7045,
-      "step": 1020
-    },
-    {
-      "epoch": 1.3154533844189016,
-      "grad_norm": 2.926434278488159,
-      "learning_rate": 6.763133304254292e-05,
-      "loss": 1.7047,
-      "step": 1030
-    },
-    {
-      "epoch": 1.3282247765006385,
-      "grad_norm": 3.011573314666748,
-      "learning_rate": 6.75421770060888e-05,
-      "loss": 1.7246,
-      "step": 1040
-    },
-    {
-      "epoch": 1.3409961685823755,
-      "grad_norm": 3.360208749771118,
-      "learning_rate": 6.745143490983756e-05,
-      "loss": 1.823,
-      "step": 1050
-    },
-    {
-      "epoch": 1.3409961685823755,
-      "eval_loss": 1.9604660272598267,
-      "eval_runtime": 28.9496,
-      "eval_samples_per_second": 17.271,
-      "eval_steps_per_second": 17.271,
-      "step": 1050
-    },
-    {
-      "epoch": 1.3537675606641124,
-      "grad_norm": 3.0648999214172363,
-      "learning_rate": 6.735911117643095e-05,
-      "loss": 1.7675,
-      "step": 1060
-    },
-    {
-      "epoch": 1.3665389527458494,
-      "grad_norm": 2.986990451812744,
-      "learning_rate": 6.726521030559751e-05,
-      "loss": 1.7806,
-      "step": 1070
-    },
-    {
-      "epoch": 1.3793103448275863,
-      "grad_norm": 2.9186227321624756,
-      "learning_rate": 6.71697368739332e-05,
-      "loss": 1.7838,
-      "step": 1080
-    },
-    {
-      "epoch": 1.392081736909323,
-      "grad_norm": 2.8783702850341797,
-      "learning_rate": 6.707269553467838e-05,
-      "loss": 1.7621,
-      "step": 1090
-    },
-    {
-      "epoch": 1.40485312899106,
-      "grad_norm": 3.190941333770752,
-      "learning_rate": 6.697409101749102e-05,
-      "loss": 1.7883,
-      "step": 1100
-    },
-    {
-      "epoch": 1.417624521072797,
-      "grad_norm": 3.060194969177246,
-      "learning_rate": 6.687392812821619e-05,
-      "loss": 1.7676,
-      "step": 1110
-    },
-    {
-      "epoch": 1.4303959131545338,
-      "grad_norm": 3.0321028232574463,
-      "learning_rate": 6.677221174865179e-05,
-      "loss": 1.7914,
-      "step": 1120
-    },
-    {
-      "epoch": 1.4431673052362708,
-      "grad_norm": 2.939692497253418,
-      "learning_rate": 6.666894683631068e-05,
-      "loss": 1.7739,
-      "step": 1130
-    },
-    {
-      "epoch": 1.4559386973180077,
-      "grad_norm": 2.909308433532715,
-      "learning_rate": 6.656413842417897e-05,
-      "loss": 1.7782,
-      "step": 1140
-    },
-    {
-      "epoch": 1.4687100893997447,
-      "grad_norm": 3.142610788345337,
-      "learning_rate": 6.645779162047084e-05,
-      "loss": 1.7468,
-      "step": 1150
-    },
-    {
-      "epoch": 1.4814814814814814,
-      "grad_norm": 3.127857208251953,
-      "learning_rate": 6.634991160837945e-05,
-      "loss": 1.8058,
-      "step": 1160
-    },
-    {
-      "epoch": 1.4942528735632183,
-      "grad_norm": 2.9502968788146973,
-      "learning_rate": 6.624050364582439e-05,
-      "loss": 1.7747,
-      "step": 1170
-    },
-    {
-      "epoch": 1.5070242656449553,
-      "grad_norm": 3.0171525478363037,
-      "learning_rate": 6.612957306519541e-05,
-      "loss": 1.8112,
-      "step": 1180
-    },
-    {
-      "epoch": 1.5197956577266922,
-      "grad_norm": 3.0701234340667725,
-      "learning_rate": 6.60171252730925e-05,
-      "loss": 1.7723,
-      "step": 1190
-    },
-    {
-      "epoch": 1.5325670498084292,
-      "grad_norm": 3.009382486343384,
-      "learning_rate": 6.590316575006243e-05,
-      "loss": 1.815,
-      "step": 1200
-    },
-    {
-      "epoch": 1.5325670498084292,
-      "eval_loss": 1.9615885019302368,
-      "eval_runtime": 27.2206,
-      "eval_samples_per_second": 18.368,
-      "eval_steps_per_second": 18.368,
-      "step": 1200
-    },
-    {
-      "epoch": 1.545338441890166,
-      "grad_norm": 2.813608407974243,
-      "learning_rate": 6.578770005033157e-05,
-      "loss": 1.7336,
-      "step": 1210
-    },
-    {
-      "epoch": 1.558109833971903,
-      "grad_norm": 3.030839443206787,
-      "learning_rate": 6.567073380153521e-05,
-      "loss": 1.7409,
-      "step": 1220
-    },
-    {
-      "epoch": 1.5708812260536398,
-      "grad_norm": 2.8138840198516846,
-      "learning_rate": 6.555227270444334e-05,
-      "loss": 1.7933,
-      "step": 1230
-    },
-    {
-      "epoch": 1.5836526181353767,
-      "grad_norm": 2.8430979251861572,
-      "learning_rate": 6.543232253268266e-05,
-      "loss": 1.7454,
-      "step": 1240
-    },
-    {
-      "epoch": 1.5964240102171137,
-      "grad_norm": 3.018444538116455,
-      "learning_rate": 6.531088913245535e-05,
-      "loss": 1.7844,
-      "step": 1250
-    },
-    {
-      "epoch": 1.6091954022988506,
-      "grad_norm": 2.859196662902832,
-      "learning_rate": 6.518797842225401e-05,
-      "loss": 1.7814,
-      "step": 1260
-    },
-    {
-      "epoch": 1.6219667943805876,
-      "grad_norm": 2.9246723651885986,
-      "learning_rate": 6.506359639257325e-05,
-      "loss": 1.7289,
-      "step": 1270
-    },
-    {
-      "epoch": 1.6347381864623243,
-      "grad_norm": 3.0037896633148193,
-      "learning_rate": 6.493774910561772e-05,
-      "loss": 1.7259,
-      "step": 1280
-    },
-    {
-      "epoch": 1.6475095785440614,
-      "grad_norm": 3.0603103637695312,
-      "learning_rate": 6.481044269500665e-05,
-      "loss": 1.6821,
-      "step": 1290
-    },
-    {
-      "epoch": 1.6602809706257982,
-      "grad_norm": 3.065397262573242,
-      "learning_rate": 6.46816833654749e-05,
-      "loss": 1.8109,
-      "step": 1300
-    },
-    {
-      "epoch": 1.673052362707535,
-      "grad_norm": 3.0101401805877686,
-      "learning_rate": 6.455147739257053e-05,
-      "loss": 1.7583,
-      "step": 1310
-    },
-    {
-      "epoch": 1.685823754789272,
-      "grad_norm": 3.0239782333374023,
-      "learning_rate": 6.441983112234894e-05,
-      "loss": 1.759,
-      "step": 1320
-    },
-    {
-      "epoch": 1.698595146871009,
-      "grad_norm": 3.068173408508301,
-      "learning_rate": 6.428675097106366e-05,
-      "loss": 1.7957,
-      "step": 1330
-    },
-    {
-      "epoch": 1.711366538952746,
-      "grad_norm": 3.2432665824890137,
-      "learning_rate": 6.415224342485348e-05,
-      "loss": 1.7558,
-      "step": 1340
-    },
-    {
-      "epoch": 1.7241379310344827,
-      "grad_norm": 3.144869327545166,
-      "learning_rate": 6.401631503942645e-05,
-      "loss": 1.7613,
-      "step": 1350
-    },
-    {
-      "epoch": 1.7241379310344827,
-      "eval_loss": 1.9588252305984497,
-      "eval_runtime": 27.7722,
-      "eval_samples_per_second": 18.004,
-      "eval_steps_per_second": 18.004,
-      "step": 1350
-    },
-    {
-      "epoch": 1.7369093231162198,
-      "grad_norm": 2.8430140018463135,
-      "learning_rate": 6.387897243974032e-05,
-      "loss": 1.767,
-      "step": 1360
-    },
-    {
-      "epoch": 1.7496807151979565,
-      "grad_norm": 3.0153555870056152,
-      "learning_rate": 6.374022231967963e-05,
-      "loss": 1.7671,
-      "step": 1370
-    },
-    {
-      "epoch": 1.7624521072796935,
-      "grad_norm": 2.876494884490967,
-      "learning_rate": 6.360007144172949e-05,
-      "loss": 1.7545,
-      "step": 1380
-    },
-    {
-      "epoch": 1.7752234993614304,
-      "grad_norm": 3.244321823120117,
-      "learning_rate": 6.345852663664596e-05,
-      "loss": 1.8216,
-      "step": 1390
-    },
-    {
-      "epoch": 1.7879948914431671,
-      "grad_norm": 2.9006967544555664,
-      "learning_rate": 6.331559480312315e-05,
-      "loss": 1.7792,
-      "step": 1400
-    },
-    {
-      "epoch": 1.8007662835249043,
-      "grad_norm": 2.7696609497070312,
-      "learning_rate": 6.317128290745699e-05,
-      "loss": 1.7941,
-      "step": 1410
-    },
-    {
-      "epoch": 1.813537675606641,
-      "grad_norm": 2.7425568103790283,
-      "learning_rate": 6.302559798320566e-05,
-      "loss": 1.7169,
-      "step": 1420
-    },
-    {
-      "epoch": 1.8263090676883782,
-      "grad_norm": 3.130302906036377,
-      "learning_rate": 6.287854713084686e-05,
-      "loss": 1.7979,
-      "step": 1430
-    },
-    {
-      "epoch": 1.839080459770115,
-      "grad_norm": 2.8706188201904297,
-      "learning_rate": 6.273013751743166e-05,
-      "loss": 1.7407,
-      "step": 1440
-    },
-    {
-      "epoch": 1.8518518518518519,
-      "grad_norm": 3.3041722774505615,
-      "learning_rate": 6.258037637623526e-05,
-      "loss": 1.7932,
-      "step": 1450
-    },
-    {
-      "epoch": 1.8646232439335888,
-      "grad_norm": 3.403945207595825,
-      "learning_rate": 6.242927100640439e-05,
-      "loss": 1.7524,
-      "step": 1460
-    },
-    {
-      "epoch": 1.8773946360153255,
-      "grad_norm": 3.074903964996338,
-      "learning_rate": 6.22768287726016e-05,
-      "loss": 1.8342,
-      "step": 1470
-    },
-    {
-      "epoch": 1.8901660280970627,
-      "grad_norm": 2.789820671081543,
-      "learning_rate": 6.212305710464628e-05,
-      "loss": 1.7446,
-      "step": 1480
-    },
-    {
-      "epoch": 1.9029374201787994,
-      "grad_norm": 3.0101871490478516,
-      "learning_rate": 6.196796349715262e-05,
-      "loss": 1.7563,
-      "step": 1490
-    },
-    {
-      "epoch": 1.9157088122605364,
-      "grad_norm": 2.9494054317474365,
-      "learning_rate": 6.181155550916422e-05,
-      "loss": 1.7892,
-      "step": 1500
-    },
-    {
-      "epoch": 1.9157088122605364,
-      "eval_loss": 1.9544912576675415,
-      "eval_runtime": 26.8049,
-      "eval_samples_per_second": 18.653,
-      "eval_steps_per_second": 18.653,
-      "step": 1500
-    },
-    {
-      "epoch": 1.9284802043422733,
-      "grad_norm": 2.912105083465576,
-      "learning_rate": 6.165384076378578e-05,
-      "loss": 1.7858,
-      "step": 1510
-    },
-    {
-      "epoch": 1.9412515964240102,
-      "grad_norm": 3.1038968563079834,
-      "learning_rate": 6.149482694781147e-05,
-      "loss": 1.8045,
-      "step": 1520
-    },
-    {
-      "epoch": 1.9540229885057472,
-      "grad_norm": 3.214099407196045,
-      "learning_rate": 6.133452181135035e-05,
-      "loss": 1.771,
-      "step": 1530
-    },
-    {
-      "epoch": 1.966794380587484,
-      "grad_norm": 2.9663312435150146,
-      "learning_rate": 6.117293316744862e-05,
-      "loss": 1.7892,
-      "step": 1540
-    },
-    {
-      "epoch": 1.979565772669221,
-      "grad_norm": 2.9719676971435547,
-      "learning_rate": 6.101006889170879e-05,
-      "loss": 1.773,
-      "step": 1550
-    },
-    {
-      "epoch": 1.9923371647509578,
-      "grad_norm": 3.2106828689575195,
-      "learning_rate": 6.0845936921905935e-05,
-      "loss": 1.7839,
-      "step": 1560
-    },
-    {
-      "epoch": 2.005108556832695,
-      "grad_norm": 2.85646390914917,
-      "learning_rate": 6.068054525760066e-05,
-      "loss": 1.6438,
-      "step": 1570
-    },
-    {
-      "epoch": 2.0178799489144317,
-      "grad_norm": 2.613004684448242,
-      "learning_rate": 6.0513901959749396e-05,
-      "loss": 1.594,
-      "step": 1580
-    },
-    {
-      "epoch": 2.0306513409961684,
-      "grad_norm": 2.919440984725952,
-      "learning_rate": 6.0346015150311366e-05,
-      "loss": 1.6577,
-      "step": 1590
-    },
-    {
-      "epoch": 2.0434227330779056,
-      "grad_norm": 2.744115114212036,
-      "learning_rate": 6.017689301185278e-05,
-      "loss": 1.6052,
-      "step": 1600
-    },
-    {
-      "epoch": 2.0561941251596423,
-      "grad_norm": 2.954751491546631,
-      "learning_rate": 6.000654378714811e-05,
-      "loss": 1.6132,
-      "step": 1610
-    },
-    {
-      "epoch": 2.0689655172413794,
-      "grad_norm": 2.783085823059082,
-      "learning_rate": 5.983497577877823e-05,
-      "loss": 1.6281,
-      "step": 1620
-    },
-    {
-      "epoch": 2.081736909323116,
-      "grad_norm": 2.958265781402588,
-      "learning_rate": 5.966219734872581e-05,
-      "loss": 1.5537,
-      "step": 1630
-    },
-    {
-      "epoch": 2.0945083014048533,
-      "grad_norm": 2.761991262435913,
-      "learning_rate": 5.9488216917967784e-05,
-      "loss": 1.6174,
-      "step": 1640
-    },
-    {
-      "epoch": 2.10727969348659,
-      "grad_norm": 2.8865253925323486,
-      "learning_rate": 5.9313042966064896e-05,
-      "loss": 1.6407,
-      "step": 1650
-    },
-    {
-      "epoch": 2.10727969348659,
-      "eval_loss": 1.9608972072601318,
-      "eval_runtime": 26.9926,
-      "eval_samples_per_second": 18.524,
-      "eval_steps_per_second": 18.524,
-      "step": 1650
-    },
-    {
-      "epoch": 2.1200510855683268,
-      "grad_norm": 2.831836700439453,
-      "learning_rate": 5.9136684030748436e-05,
-      "loss": 1.6225,
-      "step": 1660
-    },
-    {
-      "epoch": 2.132822477650064,
-      "grad_norm": 2.761368751525879,
-      "learning_rate": 5.89591487075041e-05,
-      "loss": 1.6014,
-      "step": 1670
-    },
-    {
-      "epoch": 2.1455938697318007,
-      "grad_norm": 2.7214226722717285,
-      "learning_rate": 5.8780445649153075e-05,
-      "loss": 1.6009,
-      "step": 1680
-    },
-    {
-      "epoch": 2.158365261813538,
-      "grad_norm": 3.010241746902466,
-      "learning_rate": 5.860058356543031e-05,
-      "loss": 1.5901,
-      "step": 1690
-    },
-    {
-      "epoch": 2.1711366538952745,
-      "grad_norm": 3.019549608230591,
-      "learning_rate": 5.8419571222560034e-05,
-      "loss": 1.5947,
-      "step": 1700
-    },
-    {
-      "epoch": 2.1839080459770113,
-      "grad_norm": 2.638648509979248,
-      "learning_rate": 5.823741744282845e-05,
-      "loss": 1.566,
-      "step": 1710
-    },
-    {
-      "epoch": 2.1966794380587484,
-      "grad_norm": 2.9191718101501465,
-      "learning_rate": 5.805413110415381e-05,
-      "loss": 1.5797,
-      "step": 1720
-    },
-    {
-      "epoch": 2.209450830140485,
-      "grad_norm": 3.0825226306915283,
-      "learning_rate": 5.786972113965369e-05,
-      "loss": 1.6247,
-      "step": 1730
-    },
-    {
-      "epoch": 2.2222222222222223,
-      "grad_norm": 3.186307191848755,
-      "learning_rate": 5.7684196537209574e-05,
-      "loss": 1.6044,
-      "step": 1740
-    },
-    {
-      "epoch": 2.234993614303959,
-      "grad_norm": 2.94689679145813,
-      "learning_rate": 5.749756633902887e-05,
-      "loss": 1.5743,
-      "step": 1750
-    },
-    {
-      "epoch": 2.247765006385696,
-      "grad_norm": 3.0586421489715576,
-      "learning_rate": 5.7309839641204136e-05,
-      "loss": 1.6237,
-      "step": 1760
-    },
-    {
-      "epoch": 2.260536398467433,
-      "grad_norm": 3.0845377445220947,
-      "learning_rate": 5.7121025593269777e-05,
-      "loss": 1.6194,
-      "step": 1770
-    },
-    {
-      "epoch": 2.27330779054917,
-      "grad_norm": 3.0147876739501953,
-      "learning_rate": 5.693113339775611e-05,
-      "loss": 1.5865,
-      "step": 1780
-    },
-    {
-      "epoch": 2.286079182630907,
-      "grad_norm": 3.1859536170959473,
-      "learning_rate": 5.674017230974085e-05,
-      "loss": 1.6086,
-      "step": 1790
-    },
-    {
-      "epoch": 2.2988505747126435,
-      "grad_norm": 3.106766939163208,
-      "learning_rate": 5.654815163639803e-05,
-      "loss": 1.5977,
-      "step": 1800
-    },
-    {
-      "epoch": 2.2988505747126435,
-      "eval_loss": 1.9625232219696045,
-      "eval_runtime": 28.5711,
-      "eval_samples_per_second": 17.5,
-      "eval_steps_per_second": 17.5,
-      "step": 1800
-    },
-    {
-      "epoch": 2.3116219667943807,
-      "grad_norm": 2.9333529472351074,
-      "learning_rate": 5.654815163639803e-07,
-      "loss": 1.5856,
-      "step": 1810
-    },
-    {
-      "epoch": 2.3243933588761174,
-      "grad_norm": 2.9890449047088623,
-      "learning_rate": 1.1309630327279607e-06,
-      "loss": 1.584,
-      "step": 1820
-    },
-    {
-      "epoch": 2.3371647509578546,
-      "grad_norm": 3.300159215927124,
-      "learning_rate": 1.6964445490919409e-06,
-      "loss": 1.5921,
-      "step": 1830
-    },
-    {
-      "epoch": 2.3499361430395913,
-      "grad_norm": 3.0725886821746826,
-      "learning_rate": 2.2619260654559213e-06,
-      "loss": 1.5305,
-      "step": 1840
-    },
-    {
-      "epoch": 2.362707535121328,
-      "grad_norm": 2.9211957454681396,
-      "learning_rate": 2.8274075818199017e-06,
-      "loss": 1.6247,
-      "step": 1850
-    },
-    {
-      "epoch": 2.375478927203065,
-      "grad_norm": 2.942033052444458,
-      "learning_rate": 3.3928890981838817e-06,
-      "loss": 1.5717,
-      "step": 1860
-    },
-    {
-      "epoch": 2.388250319284802,
-      "grad_norm": 2.8808839321136475,
-      "learning_rate": 3.958370614547863e-06,
-      "loss": 1.606,
-      "step": 1870
-    },
-    {
-      "epoch": 2.401021711366539,
-      "grad_norm": 2.9812419414520264,
-      "learning_rate": 4.523852130911843e-06,
-      "loss": 1.5825,
-      "step": 1880
-    },
-    {
-      "epoch": 2.413793103448276,
-      "grad_norm": 3.1213929653167725,
-      "learning_rate": 5.089333647275823e-06,
-      "loss": 1.6172,
-      "step": 1890
-    },
-    {
-      "epoch": 2.4265644955300125,
-      "grad_norm": 2.865854501724243,
-      "learning_rate": 5.6548151636398035e-06,
-      "loss": 1.6028,
-      "step": 1900
-    },
-    {
-      "epoch": 2.4393358876117497,
-      "grad_norm": 3.07641863822937,
-      "learning_rate": 6.220296680003784e-06,
-      "loss": 1.6674,
-      "step": 1910
-    },
-    {
-      "epoch": 2.4521072796934864,
-      "grad_norm": 2.7488229274749756,
-      "learning_rate": 6.7857781963677635e-06,
-      "loss": 1.5994,
-      "step": 1920
-    },
-    {
-      "epoch": 2.4648786717752236,
-      "grad_norm": 2.993471384048462,
-      "learning_rate": 7.351259712731745e-06,
-      "loss": 1.6039,
-      "step": 1930
-    },
-    {
-      "epoch": 2.4776500638569603,
-      "grad_norm": 2.7299869060516357,
-      "learning_rate": 7.916741229095726e-06,
-      "loss": 1.5046,
-      "step": 1940
-    },
-    {
-      "epoch": 2.4904214559386975,
-      "grad_norm": 3.036618947982788,
-      "learning_rate": 8.482222745459704e-06,
-      "loss": 1.6052,
-      "step": 1950
-    },
-    {
-      "epoch": 2.4904214559386975,
-      "eval_loss": 1.9600526094436646,
-      "eval_runtime": 26.6264,
-      "eval_samples_per_second": 18.778,
-      "eval_steps_per_second": 18.778,
-      "step": 1950
-    },
-    {
-      "epoch": 2.503192848020434,
-      "grad_norm": 3.1929354667663574,
-      "learning_rate": 9.047704261823685e-06,
-      "loss": 1.632,
-      "step": 1960
-    },
-    {
-      "epoch": 2.5159642401021713,
-      "grad_norm": 3.241421699523926,
-      "learning_rate": 9.613185778187667e-06,
-      "loss": 1.5757,
-      "step": 1970
-    },
-    {
-      "epoch": 2.528735632183908,
-      "grad_norm": 2.8949759006500244,
-      "learning_rate": 1.0178667294551646e-05,
-      "loss": 1.6781,
-      "step": 1980
-    },
-    {
-      "epoch": 2.541507024265645,
-      "grad_norm": 2.983260154724121,
-      "learning_rate": 1.0744148810915626e-05,
-      "loss": 1.6403,
-      "step": 1990
-    },
-    {
-      "epoch": 2.554278416347382,
-      "grad_norm": 3.2770559787750244,
-      "learning_rate": 1.1309630327279607e-05,
-      "loss": 1.6642,
-      "step": 2000
-    },
-    {
-      "epoch": 2.5670498084291187,
-      "grad_norm": 2.995584487915039,
-      "learning_rate": 1.1875111843643587e-05,
-      "loss": 1.6282,
-      "step": 2010
-    },
-    {
-      "epoch": 2.579821200510856,
-      "grad_norm": 3.113140106201172,
-      "learning_rate": 1.2440593360007568e-05,
-      "loss": 1.5997,
-      "step": 2020
-    },
-    {
-      "epoch": 2.5925925925925926,
-      "grad_norm": 2.9088339805603027,
-      "learning_rate": 1.3006074876371547e-05,
-      "loss": 1.6134,
-      "step": 2030
-    },
-    {
-      "epoch": 2.6053639846743293,
-      "grad_norm": 2.9703259468078613,
-      "learning_rate": 1.3571556392735527e-05,
-      "loss": 1.5381,
-      "step": 2040
-    },
-    {
-      "epoch": 2.6181353767560664,
-      "grad_norm": 3.0287814140319824,
-      "learning_rate": 1.4137037909099508e-05,
-      "loss": 1.6708,
-      "step": 2050
-    },
-    {
-      "epoch": 2.630906768837803,
-      "grad_norm": 3.0678138732910156,
-      "learning_rate": 1.470251942546349e-05,
-      "loss": 1.6232,
-      "step": 2060
-    },
-    {
-      "epoch": 2.6436781609195403,
-      "grad_norm": 2.8624658584594727,
-      "learning_rate": 1.526800094182747e-05,
-      "loss": 1.5495,
-      "step": 2070
-    },
-    {
-      "epoch": 2.656449553001277,
-      "grad_norm": 3.0500433444976807,
-      "learning_rate": 1.5833482458191452e-05,
-      "loss": 1.5992,
-      "step": 2080
-    },
-    {
-      "epoch": 2.6692209450830138,
-      "grad_norm": 2.9770045280456543,
-      "learning_rate": 1.639896397455543e-05,
-      "loss": 1.6213,
-      "step": 2090
-    },
-    {
-      "epoch": 2.681992337164751,
-      "grad_norm": 3.110860586166382,
-      "learning_rate": 1.6964445490919408e-05,
-      "loss": 1.5852,
-      "step": 2100
-    },
-    {
-      "epoch": 2.681992337164751,
-      "eval_loss": 1.957663893699646,
-      "eval_runtime": 26.8577,
-      "eval_samples_per_second": 18.617,
-      "eval_steps_per_second": 18.617,
-      "step": 2100
-    },
-    {
-      "epoch": 2.694763729246488,
-      "grad_norm": 2.987915515899658,
-      "learning_rate": 1.752992700728339e-05,
-      "loss": 1.613,
-      "step": 2110
-    },
-    {
-      "epoch": 2.707535121328225,
-      "grad_norm": 3.1154093742370605,
-      "learning_rate": 1.809540852364737e-05,
-      "loss": 1.6002,
-      "step": 2120
-    },
-    {
-      "epoch": 2.7203065134099615,
-      "grad_norm": 3.175532817840576,
-      "learning_rate": 1.8660890040011352e-05,
-      "loss": 1.5809,
-      "step": 2130
-    },
-    {
-      "epoch": 2.7330779054916987,
-      "grad_norm": 2.9489593505859375,
-      "learning_rate": 1.9226371556375333e-05,
-      "loss": 1.5821,
-      "step": 2140
-    },
-    {
-      "epoch": 2.7458492975734354,
-      "grad_norm": 2.855994939804077,
-      "learning_rate": 1.979185307273931e-05,
-      "loss": 1.5858,
-      "step": 2150
-    },
-    {
-      "epoch": 2.7586206896551726,
-      "grad_norm": 2.9239962100982666,
-      "learning_rate": 2.0357334589103292e-05,
-      "loss": 1.5816,
-      "step": 2160
-    },
-    {
-      "epoch": 2.7713920817369093,
-      "grad_norm": 2.9092483520507812,
-      "learning_rate": 2.0922816105467273e-05,
-      "loss": 1.6278,
-      "step": 2170
-    },
-    {
-      "epoch": 2.784163473818646,
-      "grad_norm": 3.0730059146881104,
-      "learning_rate": 2.148829762183125e-05,
-      "loss": 1.587,
-      "step": 2180
-    },
-    {
-      "epoch": 2.796934865900383,
-      "grad_norm": 2.9733469486236572,
-      "learning_rate": 2.2053779138195233e-05,
-      "loss": 1.5931,
-      "step": 2190
-    },
-    {
-      "epoch": 2.80970625798212,
-      "grad_norm": 2.882889986038208,
-      "learning_rate": 2.2619260654559214e-05,
-      "loss": 1.6206,
-      "step": 2200
-    },
-    {
-      "epoch": 2.822477650063857,
-      "grad_norm": 3.2219135761260986,
-      "learning_rate": 2.3184742170923192e-05,
-      "loss": 1.5782,
-      "step": 2210
-    },
-    {
-      "epoch": 2.835249042145594,
-      "grad_norm": 3.1879076957702637,
-      "learning_rate": 2.3750223687287173e-05,
-      "loss": 1.6398,
-      "step": 2220
-    },
-    {
-      "epoch": 2.8480204342273305,
-      "grad_norm": 3.0046067237854004,
-      "learning_rate": 2.4315705203651154e-05,
-      "loss": 1.6471,
-      "step": 2230
-    },
-    {
-      "epoch": 2.8607918263090677,
-      "grad_norm": 2.837573766708374,
-      "learning_rate": 2.4881186720015136e-05,
-      "loss": 1.5785,
-      "step": 2240
-    },
-    {
-      "epoch": 2.873563218390805,
-      "grad_norm": 2.998067855834961,
-      "learning_rate": 2.5446668236379117e-05,
-      "loss": 1.6123,
-      "step": 2250
-    },
-    {
-      "epoch": 2.873563218390805,
-      "eval_loss": 1.9578022956848145,
-      "eval_runtime": 27.614,
-      "eval_samples_per_second": 18.107,
-      "eval_steps_per_second": 18.107,
-      "step": 2250
-    },
-    {
-      "epoch": 2.8863346104725416,
-      "grad_norm": 3.0444719791412354,
-      "learning_rate": 2.6012149752743095e-05,
-      "loss": 1.5877,
-      "step": 2260
-    },
-    {
-      "epoch": 2.8991060025542783,
-      "grad_norm": 3.109495162963867,
-      "learning_rate": 2.6577631269107073e-05,
-      "loss": 1.6581,
-      "step": 2270
-    },
-    {
-      "epoch": 2.9118773946360155,
-      "grad_norm": 2.8879830837249756,
-      "learning_rate": 2.7143112785471054e-05,
-      "loss": 1.6034,
-      "step": 2280
-    },
-    {
-      "epoch": 2.924648786717752,
-      "grad_norm": 3.3881685733795166,
-      "learning_rate": 2.7708594301835035e-05,
-      "loss": 1.6445,
-      "step": 2290
-    },
-    {
-      "epoch": 2.9374201787994894,
-      "grad_norm": 2.869450330734253,
-      "learning_rate": 2.8274075818199017e-05,
-      "loss": 1.5976,
-      "step": 2300
-    },
-    {
-      "epoch": 2.950191570881226,
-      "grad_norm": 2.9405651092529297,
-      "learning_rate": 2.8273731308557363e-05,
-      "loss": 1.5806,
-      "step": 2310
-    },
-    {
-      "epoch": 2.962962962962963,
-      "grad_norm": 3.227057695388794,
-      "learning_rate": 2.8272697796423325e-05,
-      "loss": 1.6023,
-      "step": 2320
-    },
-    {
-      "epoch": 2.9757343550447,
-      "grad_norm": 3.0668394565582275,
-      "learning_rate": 2.8270975332168814e-05,
-      "loss": 1.5949,
-      "step": 2330
-    },
-    {
-      "epoch": 2.9885057471264367,
-      "grad_norm": 3.4752612113952637,
-      "learning_rate": 2.8268563999744315e-05,
-      "loss": 1.6142,
-      "step": 2340
-    },
-    {
-      "epoch": 3.001277139208174,
-      "grad_norm": 2.851846694946289,
-      "learning_rate": 2.8265463916674774e-05,
-      "loss": 1.6006,
-      "step": 2350
-    },
-    {
-      "epoch": 3.0140485312899106,
-      "grad_norm": 2.914916753768921,
-      "learning_rate": 2.8261675234053857e-05,
-      "loss": 1.5599,
-      "step": 2360
-    },
-    {
-      "epoch": 3.0268199233716473,
-      "grad_norm": 3.022381544113159,
-      "learning_rate": 2.825719813653661e-05,
-      "loss": 1.561,
-      "step": 2370
-    },
-    {
-      "epoch": 3.0395913154533845,
-      "grad_norm": 2.812488079071045,
-      "learning_rate": 2.8252032842330455e-05,
-      "loss": 1.5051,
-      "step": 2380
-    },
-    {
-      "epoch": 3.052362707535121,
-      "grad_norm": 2.879337787628174,
-      "learning_rate": 2.8246179603184542e-05,
-      "loss": 1.5499,
-      "step": 2390
-    },
-    {
-      "epoch": 3.0651340996168583,
-      "grad_norm": 3.1652944087982178,
-      "learning_rate": 2.823963870437749e-05,
-      "loss": 1.5814,
-      "step": 2400
-    },
-    {
-      "epoch": 3.0651340996168583,
-      "eval_loss": 1.9594498872756958,
-      "eval_runtime": 27.4191,
-      "eval_samples_per_second": 18.235,
-      "eval_steps_per_second": 18.235,
-      "step": 2400
-    },
-    {
-      "epoch": 3.077905491698595,
-      "grad_norm": 3.0777502059936523,
-      "learning_rate": 2.8239638704377494e-07,
-      "loss": 1.5266,
-      "step": 2410
-    },
-    {
-      "epoch": 3.0906768837803322,
-      "grad_norm": 2.999093770980835,
-      "learning_rate": 5.647927740875499e-07,
-      "loss": 1.5265,
-      "step": 2420
-    },
-    {
-      "epoch": 3.103448275862069,
-      "grad_norm": 3.03286075592041,
-      "learning_rate": 8.471891611313247e-07,
-      "loss": 1.5492,
-      "step": 2430
-    },
-    {
-      "epoch": 3.1162196679438057,
-      "grad_norm": 3.1120502948760986,
-      "learning_rate": 1.1295855481750998e-06,
-      "loss": 1.5494,
-      "step": 2440
-    },
-    {
-      "epoch": 3.128991060025543,
-      "grad_norm": 3.289019823074341,
-      "learning_rate": 1.4119819352188746e-06,
-      "loss": 1.5846,
-      "step": 2450
-    },
-    {
-      "epoch": 3.1417624521072796,
-      "grad_norm": 2.9034361839294434,
-      "learning_rate": 1.6943783222626494e-06,
-      "loss": 1.553,
-      "step": 2460
-    },
-    {
-      "epoch": 3.1545338441890167,
-      "grad_norm": 3.0615646839141846,
-      "learning_rate": 1.9767747093064247e-06,
-      "loss": 1.5219,
-      "step": 2470
-    },
-    {
-      "epoch": 3.1673052362707534,
-      "grad_norm": 2.944612741470337,
-      "learning_rate": 2.2591710963501995e-06,
-      "loss": 1.5853,
-      "step": 2480
-    },
-    {
-      "epoch": 3.1800766283524906,
-      "grad_norm": 2.9914486408233643,
-      "learning_rate": 2.5415674833939743e-06,
-      "loss": 1.5752,
-      "step": 2490
-    },
-    {
-      "epoch": 3.1928480204342273,
-      "grad_norm": 2.6826584339141846,
-      "learning_rate": 2.823963870437749e-06,
-      "loss": 1.5066,
-      "step": 2500
-    },
-    {
-      "epoch": 3.205619412515964,
-      "grad_norm": 3.1901533603668213,
-      "learning_rate": 3.106360257481524e-06,
-      "loss": 1.5461,
-      "step": 2510
-    },
-    {
-      "epoch": 3.218390804597701,
-      "grad_norm": 3.0013110637664795,
-      "learning_rate": 3.388756644525299e-06,
-      "loss": 1.5591,
-      "step": 2520
-    },
-    {
-      "epoch": 3.231162196679438,
-      "grad_norm": 3.266486406326294,
-      "learning_rate": 3.671153031569074e-06,
-      "loss": 1.5211,
-      "step": 2530
-    },
-    {
-      "epoch": 3.243933588761175,
-      "grad_norm": 2.939565896987915,
-      "learning_rate": 3.953549418612849e-06,
-      "loss": 1.5392,
-      "step": 2540
-    },
-    {
-      "epoch": 3.256704980842912,
-      "grad_norm": 2.8810532093048096,
-      "learning_rate": 4.235945805656624e-06,
-      "loss": 1.5439,
-      "step": 2550
-    },
-    {
-      "epoch": 3.256704980842912,
-      "eval_loss": 1.9594852924346924,
-      "eval_runtime": 27.7206,
-      "eval_samples_per_second": 18.037,
-      "eval_steps_per_second": 18.037,
-      "step": 2550
     }
   ],
   "logging_steps": 10,
@@ -1947,7 +139,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.48659005668393e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 1.9780572652816772,
+  "best_model_checkpoint": "./output/checkpoint-150",
+  "epoch": 0.19157088122605365,
   "eval_steps": 150,
+  "global_step": 150,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.01277139208173691,
+      "grad_norm": 2.1011273860931396,
+      "learning_rate": 4.666666666666666e-06,
+      "loss": 1.9177,
       "step": 10
     },
     {
       "epoch": 0.02554278416347382,
+      "grad_norm": 2.0189812183380127,
+      "learning_rate": 9.333333333333333e-06,
+      "loss": 1.9419,
       "step": 20
     },
     {
       "epoch": 0.038314176245210725,
+      "grad_norm": 2.073760509490967,
+      "learning_rate": 1.4e-05,
+      "loss": 1.9122,
       "step": 30
     },
     {
       "epoch": 0.05108556832694764,
+      "grad_norm": 1.955664873123169,
+      "learning_rate": 1.8666666666666665e-05,
+      "loss": 1.8872,
       "step": 40
     },
     {
       "epoch": 0.06385696040868455,
+      "grad_norm": 2.6010475158691406,
+      "learning_rate": 2.333333333333333e-05,
+      "loss": 1.9779,
       "step": 50
     },
     {
       "epoch": 0.07662835249042145,
+      "grad_norm": 2.0808260440826416,
+      "learning_rate": 2.8e-05,
+      "loss": 1.933,
       "step": 60
     },
     {
       "epoch": 0.08939974457215837,
+      "grad_norm": 1.969761848449707,
+      "learning_rate": 3.266666666666666e-05,
+      "loss": 1.946,
       "step": 70
     },
     {
       "epoch": 0.10217113665389528,
+      "grad_norm": 2.136836290359497,
+      "learning_rate": 3.733333333333333e-05,
+      "loss": 1.9441,
       "step": 80
     },
     {
       "epoch": 0.11494252873563218,
+      "grad_norm": 2.056912899017334,
+      "learning_rate": 4.2e-05,
+      "loss": 1.9262,
       "step": 90
     },
     {
       "epoch": 0.1277139208173691,
+      "grad_norm": 2.1491384506225586,
+      "learning_rate": 4.666666666666666e-05,
+      "loss": 1.9775,
       "step": 100
     },
     {
       "epoch": 0.140485312899106,
+      "grad_norm": 1.8882553577423096,
+      "learning_rate": 5.1333333333333325e-05,
+      "loss": 1.9233,
       "step": 110
     },
     {
       "epoch": 0.1532567049808429,
+      "grad_norm": 2.0507898330688477,
+      "learning_rate": 5.6e-05,
+      "loss": 1.9408,
       "step": 120
     },
     {
       "epoch": 0.16602809706257982,
+      "grad_norm": 2.2763912677764893,
+      "learning_rate": 6.0666666666666666e-05,
+      "loss": 1.9429,
       "step": 130
     },
     {
       "epoch": 0.17879948914431673,
+      "grad_norm": 2.1040444374084473,
+      "learning_rate": 6.533333333333333e-05,
+      "loss": 1.9193,
       "step": 140
     },
     {
       "epoch": 0.19157088122605365,
+      "grad_norm": 2.0764999389648438,
+      "learning_rate": 7e-05,
+      "loss": 1.9405,
       "step": 150
     },
     {
       "epoch": 0.19157088122605365,
+      "eval_loss": 1.9780572652816772,
+      "eval_runtime": 24.3012,
+      "eval_samples_per_second": 20.575,
+      "eval_steps_per_second": 20.575,
       "step": 150
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 2084245000224768.0,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d858d0509d8d3f066b40d4d22294b620c54719ac51743859316dd522404d30f7
 size 5496

 version https://git-lfs.github.com/spec/v1
+oid sha256:73d63faa96a8013f73d4d225b5f62be5f6f1a8819c12a7d65e93c26570162b6b
 size 5496