Training in progress, step 150, checkpoint

Browse files

Files changed (7) hide show

last-checkpoint/adapter_config.json +3 -3
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +36 -1505
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -20,10 +20,10 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
     "o_proj",
-    "q_proj",
-    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "v_proj",
     "o_proj",
+    "k_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:801aa7b997a7e4b8dcd981fd60baee3ecd664f54dc8c3da10e0115165f077998
 size 6832520

 version https://git-lfs.github.com/spec/v1
+oid sha256:d22559dc9f3539c34081db29203f0c0625e53a718119282ae5e9cbbbbfea04ad
 size 6832520

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d851dd04f635ce70c88f6498f39d69d87ca59d3d81be6b72bdecd25c393ab3a9
 size 13739450

 version https://git-lfs.github.com/spec/v1
+oid sha256:065d4ede74e3982d45eabd75bcf3791c1732814b3c2687bdef88b71c498b41db
 size 13739450

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1d8b738ca9450d433fb2c95bc254c194b583c6dab018288271e33a0fa05a0406
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:ba79f79f4200644bcde298b8ba358af98910b10cc152e720addca023a2e47a37
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0fc647c62a4029a5bbab67166d420aae87372bfe059358b28e21b340b6d55085
 size 1256

 version https://git-lfs.github.com/spec/v1
+oid sha256:6186f1e9836e20fd270e1ab773c83f1027d92e426fd1d0a8c7816f8a9115c5fd
 size 1256

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,1594 +1,125 @@
 {
-  "best_metric": 1.9337016344070435,
-  "best_model_checkpoint": "./output/checkpoint-1500",
-  "epoch": 2.681992337164751,
   "eval_steps": 150,
-  "global_step": 2100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.01277139208173691,
-      "grad_norm": 2.1011273860931396,
       "learning_rate": 4.666666666666666e-06,
-      "loss": 1.9177,
       "step": 10
     },
     {
       "epoch": 0.02554278416347382,
-      "grad_norm": 2.0189812183380127,
       "learning_rate": 9.333333333333333e-06,
-      "loss": 1.9419,
       "step": 20
     },
     {
       "epoch": 0.038314176245210725,
-      "grad_norm": 2.073760509490967,
       "learning_rate": 1.4e-05,
-      "loss": 1.9122,
       "step": 30
     },
     {
       "epoch": 0.05108556832694764,
-      "grad_norm": 1.955664873123169,
       "learning_rate": 1.8666666666666665e-05,
-      "loss": 1.8872,
       "step": 40
     },
     {
       "epoch": 0.06385696040868455,
-      "grad_norm": 2.6010475158691406,
       "learning_rate": 2.333333333333333e-05,
       "loss": 1.9779,
       "step": 50
     },
     {
       "epoch": 0.07662835249042145,
-      "grad_norm": 2.0808260440826416,
       "learning_rate": 2.8e-05,
-      "loss": 1.933,
       "step": 60
     },
     {
       "epoch": 0.08939974457215837,
-      "grad_norm": 1.969761848449707,
       "learning_rate": 3.266666666666666e-05,
-      "loss": 1.946,
       "step": 70
     },
     {
       "epoch": 0.10217113665389528,
-      "grad_norm": 2.136836290359497,
       "learning_rate": 3.733333333333333e-05,
-      "loss": 1.9441,
       "step": 80
     },
     {
       "epoch": 0.11494252873563218,
-      "grad_norm": 2.056912899017334,
       "learning_rate": 4.2e-05,
-      "loss": 1.9262,
       "step": 90
     },
     {
       "epoch": 0.1277139208173691,
-      "grad_norm": 2.1491384506225586,
       "learning_rate": 4.666666666666666e-05,
-      "loss": 1.9775,
       "step": 100
     },
     {
       "epoch": 0.140485312899106,
-      "grad_norm": 1.8882553577423096,
       "learning_rate": 5.1333333333333325e-05,
       "loss": 1.9233,
       "step": 110
     },
     {
       "epoch": 0.1532567049808429,
-      "grad_norm": 2.0507898330688477,
       "learning_rate": 5.6e-05,
-      "loss": 1.9408,
       "step": 120
     },
     {
       "epoch": 0.16602809706257982,
-      "grad_norm": 2.2763912677764893,
       "learning_rate": 6.0666666666666666e-05,
-      "loss": 1.9429,
       "step": 130
     },
     {
       "epoch": 0.17879948914431673,
-      "grad_norm": 2.1040444374084473,
       "learning_rate": 6.533333333333333e-05,
-      "loss": 1.9193,
       "step": 140
     },
     {
       "epoch": 0.19157088122605365,
-      "grad_norm": 2.0764999389648438,
       "learning_rate": 7e-05,
       "loss": 1.9405,
       "step": 150
     },
     {
       "epoch": 0.19157088122605365,
-      "eval_loss": 1.9780572652816772,
-      "eval_runtime": 24.3012,
-      "eval_samples_per_second": 20.575,
-      "eval_steps_per_second": 20.575,
       "step": 150
-    },
-    {
-      "epoch": 0.20434227330779056,
-      "grad_norm": 2.221877336502075,
-      "learning_rate": 6.999926573504895e-05,
-      "loss": 1.881,
-      "step": 160
-    },
-    {
-      "epoch": 0.21711366538952745,
-      "grad_norm": 2.2773871421813965,
-      "learning_rate": 6.999706297100412e-05,
-      "loss": 1.9073,
-      "step": 170
-    },
-    {
-      "epoch": 0.22988505747126436,
-      "grad_norm": 2.2678256034851074,
-      "learning_rate": 6.999339180028904e-05,
-      "loss": 1.8844,
-      "step": 180
-    },
-    {
-      "epoch": 0.24265644955300128,
-      "grad_norm": 2.0379583835601807,
-      "learning_rate": 6.99882523769387e-05,
-      "loss": 1.9496,
-      "step": 190
-    },
-    {
-      "epoch": 0.2554278416347382,
-      "grad_norm": 2.1946399211883545,
-      "learning_rate": 6.998164491659302e-05,
-      "loss": 1.9193,
-      "step": 200
-    },
-    {
-      "epoch": 0.2681992337164751,
-      "grad_norm": 2.218657970428467,
-      "learning_rate": 6.997356969648778e-05,
-      "loss": 1.834,
-      "step": 210
-    },
-    {
-      "epoch": 0.280970625798212,
-      "grad_norm": 2.1720638275146484,
-      "learning_rate": 6.996402705544307e-05,
-      "loss": 1.9135,
-      "step": 220
-    },
-    {
-      "epoch": 0.2937420178799489,
-      "grad_norm": 2.029083251953125,
-      "learning_rate": 6.995301739384896e-05,
-      "loss": 1.9661,
-      "step": 230
-    },
-    {
-      "epoch": 0.3065134099616858,
-      "grad_norm": 1.9260584115982056,
-      "learning_rate": 6.994054117364885e-05,
-      "loss": 1.9046,
-      "step": 240
-    },
-    {
-      "epoch": 0.31928480204342274,
-      "grad_norm": 2.0648281574249268,
-      "learning_rate": 6.992659891831991e-05,
-      "loss": 1.9672,
-      "step": 250
-    },
-    {
-      "epoch": 0.33205619412515963,
-      "grad_norm": 1.964780330657959,
-      "learning_rate": 6.991119121285126e-05,
-      "loss": 1.9125,
-      "step": 260
-    },
-    {
-      "epoch": 0.3448275862068966,
-      "grad_norm": 2.132140636444092,
-      "learning_rate": 6.989431870371936e-05,
-      "loss": 1.9624,
-      "step": 270
-    },
-    {
-      "epoch": 0.35759897828863346,
-      "grad_norm": 2.0243074893951416,
-      "learning_rate": 6.987598209886091e-05,
-      "loss": 1.9409,
-      "step": 280
-    },
-    {
-      "epoch": 0.37037037037037035,
-      "grad_norm": 1.9534668922424316,
-      "learning_rate": 6.985618216764314e-05,
-      "loss": 1.9589,
-      "step": 290
-    },
-    {
-      "epoch": 0.3831417624521073,
-      "grad_norm": 1.9132697582244873,
-      "learning_rate": 6.983491974083148e-05,
-      "loss": 1.9467,
-      "step": 300
-    },
-    {
-      "epoch": 0.3831417624521073,
-      "eval_loss": 1.962647795677185,
-      "eval_runtime": 24.5626,
-      "eval_samples_per_second": 20.356,
-      "eval_steps_per_second": 20.356,
-      "step": 300
-    },
-    {
-      "epoch": 0.3959131545338442,
-      "grad_norm": 2.107532262802124,
-      "learning_rate": 6.981219571055479e-05,
-      "loss": 1.8887,
-      "step": 310
-    },
-    {
-      "epoch": 0.4086845466155811,
-      "grad_norm": 1.9190975427627563,
-      "learning_rate": 6.978801103026786e-05,
-      "loss": 1.9321,
-      "step": 320
-    },
-    {
-      "epoch": 0.421455938697318,
-      "grad_norm": 2.313232421875,
-      "learning_rate": 6.976236671471145e-05,
-      "loss": 1.9673,
-      "step": 330
-    },
-    {
-      "epoch": 0.4342273307790549,
-      "grad_norm": 2.028118848800659,
-      "learning_rate": 6.973526383986968e-05,
-      "loss": 1.8951,
-      "step": 340
-    },
-    {
-      "epoch": 0.44699872286079184,
-      "grad_norm": 1.9303714036941528,
-      "learning_rate": 6.970670354292487e-05,
-      "loss": 1.888,
-      "step": 350
-    },
-    {
-      "epoch": 0.45977011494252873,
-      "grad_norm": 1.9469075202941895,
-      "learning_rate": 6.967668702220992e-05,
-      "loss": 1.9534,
-      "step": 360
-    },
-    {
-      "epoch": 0.4725415070242657,
-      "grad_norm": 1.9907073974609375,
-      "learning_rate": 6.964521553715788e-05,
-      "loss": 1.928,
-      "step": 370
-    },
-    {
-      "epoch": 0.48531289910600256,
-      "grad_norm": 1.9961628913879395,
-      "learning_rate": 6.961229040824927e-05,
-      "loss": 1.8961,
-      "step": 380
-    },
-    {
-      "epoch": 0.49808429118773945,
-      "grad_norm": 2.0501065254211426,
-      "learning_rate": 6.957791301695653e-05,
-      "loss": 1.8465,
-      "step": 390
-    },
-    {
-      "epoch": 0.5108556832694764,
-      "grad_norm": 2.014704704284668,
-      "learning_rate": 6.954208480568616e-05,
-      "loss": 1.9278,
-      "step": 400
-    },
-    {
-      "epoch": 0.5236270753512133,
-      "grad_norm": 1.8024693727493286,
-      "learning_rate": 6.950480727771816e-05,
-      "loss": 1.945,
-      "step": 410
-    },
-    {
-      "epoch": 0.5363984674329502,
-      "grad_norm": 2.1424338817596436,
-      "learning_rate": 6.946608199714291e-05,
-      "loss": 1.9117,
-      "step": 420
-    },
-    {
-      "epoch": 0.5491698595146871,
-      "grad_norm": 1.9119850397109985,
-      "learning_rate": 6.942591058879567e-05,
-      "loss": 1.9086,
-      "step": 430
-    },
-    {
-      "epoch": 0.561941251596424,
-      "grad_norm": 1.866670846939087,
-      "learning_rate": 6.938429473818823e-05,
-      "loss": 1.91,
-      "step": 440
-    },
-    {
-      "epoch": 0.5747126436781609,
-      "grad_norm": 1.9712364673614502,
-      "learning_rate": 6.934123619143835e-05,
-      "loss": 1.899,
-      "step": 450
-    },
-    {
-      "epoch": 0.5747126436781609,
-      "eval_loss": 1.9519777297973633,
-      "eval_runtime": 24.1832,
-      "eval_samples_per_second": 20.676,
-      "eval_steps_per_second": 20.676,
-      "step": 450
-    },
-    {
-      "epoch": 0.5874840357598978,
-      "grad_norm": 1.8985211849212646,
-      "learning_rate": 6.929673675519645e-05,
-      "loss": 1.8811,
-      "step": 460
-    },
-    {
-      "epoch": 0.6002554278416348,
-      "grad_norm": 1.8869976997375488,
-      "learning_rate": 6.92507982965697e-05,
-      "loss": 1.8547,
-      "step": 470
-    },
-    {
-      "epoch": 0.6130268199233716,
-      "grad_norm": 2.027930974960327,
-      "learning_rate": 6.920342274304384e-05,
-      "loss": 1.8605,
-      "step": 480
-    },
-    {
-      "epoch": 0.6257982120051085,
-      "grad_norm": 1.832667350769043,
-      "learning_rate": 6.915461208240223e-05,
-      "loss": 1.8656,
-      "step": 490
-    },
-    {
-      "epoch": 0.6385696040868455,
-      "grad_norm": 1.9933980703353882,
-      "learning_rate": 6.910436836264241e-05,
-      "loss": 1.9675,
-      "step": 500
-    },
-    {
-      "epoch": 0.6513409961685823,
-      "grad_norm": 1.8492072820663452,
-      "learning_rate": 6.905269369189023e-05,
-      "loss": 1.8948,
-      "step": 510
-    },
-    {
-      "epoch": 0.6641123882503193,
-      "grad_norm": 2.021852970123291,
-      "learning_rate": 6.899959023831139e-05,
-      "loss": 1.8755,
-      "step": 520
-    },
-    {
-      "epoch": 0.6768837803320562,
-      "grad_norm": 1.7901625633239746,
-      "learning_rate": 6.894506023002045e-05,
-      "loss": 1.8934,
-      "step": 530
-    },
-    {
-      "epoch": 0.6896551724137931,
-      "grad_norm": 1.9580366611480713,
-      "learning_rate": 6.888910595498735e-05,
-      "loss": 1.9229,
-      "step": 540
-    },
-    {
-      "epoch": 0.70242656449553,
-      "grad_norm": 1.8306083679199219,
-      "learning_rate": 6.883172976094139e-05,
-      "loss": 1.9469,
-      "step": 550
-    },
-    {
-      "epoch": 0.7151979565772669,
-      "grad_norm": 1.8871126174926758,
-      "learning_rate": 6.877293405527276e-05,
-      "loss": 1.914,
-      "step": 560
-    },
-    {
-      "epoch": 0.7279693486590039,
-      "grad_norm": 1.7971593141555786,
-      "learning_rate": 6.871272130493153e-05,
-      "loss": 1.8504,
-      "step": 570
-    },
-    {
-      "epoch": 0.7407407407407407,
-      "grad_norm": 1.9786813259124756,
-      "learning_rate": 6.86510940363241e-05,
-      "loss": 1.876,
-      "step": 580
-    },
-    {
-      "epoch": 0.7535121328224776,
-      "grad_norm": 1.9042707681655884,
-      "learning_rate": 6.858805483520723e-05,
-      "loss": 1.8948,
-      "step": 590
-    },
-    {
-      "epoch": 0.7662835249042146,
-      "grad_norm": 2.191734552383423,
-      "learning_rate": 6.852360634657953e-05,
-      "loss": 1.9266,
-      "step": 600
-    },
-    {
-      "epoch": 0.7662835249042146,
-      "eval_loss": 1.9498045444488525,
-      "eval_runtime": 24.164,
-      "eval_samples_per_second": 20.692,
-      "eval_steps_per_second": 20.692,
-      "step": 600
-    },
-    {
-      "epoch": 0.7790549169859514,
-      "grad_norm": 1.8927336931228638,
-      "learning_rate": 6.845775127457056e-05,
-      "loss": 1.8992,
-      "step": 610
-    },
-    {
-      "epoch": 0.7918263090676884,
-      "grad_norm": 2.0473830699920654,
-      "learning_rate": 6.839049238232719e-05,
-      "loss": 1.8489,
-      "step": 620
-    },
-    {
-      "epoch": 0.8045977011494253,
-      "grad_norm": 1.8225151300430298,
-      "learning_rate": 6.832183249189786e-05,
-      "loss": 1.923,
-      "step": 630
-    },
-    {
-      "epoch": 0.8173690932311622,
-      "grad_norm": 1.8365392684936523,
-      "learning_rate": 6.825177448411404e-05,
-      "loss": 1.8782,
-      "step": 640
-    },
-    {
-      "epoch": 0.8301404853128991,
-      "grad_norm": 1.8688030242919922,
-      "learning_rate": 6.818032129846945e-05,
-      "loss": 1.9063,
-      "step": 650
-    },
-    {
-      "epoch": 0.842911877394636,
-      "grad_norm": 1.7445991039276123,
-      "learning_rate": 6.810747593299666e-05,
-      "loss": 1.8548,
-      "step": 660
-    },
-    {
-      "epoch": 0.855683269476373,
-      "grad_norm": 1.7961305379867554,
-      "learning_rate": 6.803324144414127e-05,
-      "loss": 1.8504,
-      "step": 670
-    },
-    {
-      "epoch": 0.8684546615581098,
-      "grad_norm": 1.8798601627349854,
-      "learning_rate": 6.795762094663379e-05,
-      "loss": 1.9029,
-      "step": 680
-    },
-    {
-      "epoch": 0.8812260536398467,
-      "grad_norm": 1.9716835021972656,
-      "learning_rate": 6.788061761335882e-05,
-      "loss": 1.9092,
-      "step": 690
-    },
-    {
-      "epoch": 0.8939974457215837,
-      "grad_norm": 1.8877177238464355,
-      "learning_rate": 6.780223467522203e-05,
-      "loss": 1.9268,
-      "step": 700
-    },
-    {
-      "epoch": 0.9067688378033205,
-      "grad_norm": 2.017631769180298,
-      "learning_rate": 6.772247542101449e-05,
-      "loss": 1.9685,
-      "step": 710
-    },
-    {
-      "epoch": 0.9195402298850575,
-      "grad_norm": 1.9911872148513794,
-      "learning_rate": 6.764134319727477e-05,
-      "loss": 1.8774,
-      "step": 720
-    },
-    {
-      "epoch": 0.9323116219667944,
-      "grad_norm": 1.7452633380889893,
-      "learning_rate": 6.755884140814851e-05,
-      "loss": 1.8693,
-      "step": 730
-    },
-    {
-      "epoch": 0.9450830140485313,
-      "grad_norm": 1.9967455863952637,
-      "learning_rate": 6.747497351524552e-05,
-      "loss": 1.897,
-      "step": 740
-    },
-    {
-      "epoch": 0.9578544061302682,
-      "grad_norm": 1.7866498231887817,
-      "learning_rate": 6.738974303749464e-05,
-      "loss": 1.8744,
-      "step": 750
-    },
-    {
-      "epoch": 0.9578544061302682,
-      "eval_loss": 1.9411674737930298,
-      "eval_runtime": 25.0752,
-      "eval_samples_per_second": 19.94,
-      "eval_steps_per_second": 19.94,
-      "step": 750
-    },
-    {
-      "epoch": 0.9706257982120051,
-      "grad_norm": 1.8407279253005981,
-      "learning_rate": 6.7303153550996e-05,
-      "loss": 1.8936,
-      "step": 760
-    },
-    {
-      "epoch": 0.9833971902937421,
-      "grad_norm": 2.055628538131714,
-      "learning_rate": 6.721520868887103e-05,
-      "loss": 1.8761,
-      "step": 770
-    },
-    {
-      "epoch": 0.9961685823754789,
-      "grad_norm": 1.8534340858459473,
-      "learning_rate": 6.712591214111003e-05,
-      "loss": 1.8828,
-      "step": 780
-    },
-    {
-      "epoch": 1.0089399744572158,
-      "grad_norm": 1.8435447216033936,
-      "learning_rate": 6.703526765441728e-05,
-      "loss": 1.8907,
-      "step": 790
-    },
-    {
-      "epoch": 1.0217113665389528,
-      "grad_norm": 1.971137285232544,
-      "learning_rate": 6.69432790320539e-05,
-      "loss": 1.8402,
-      "step": 800
-    },
-    {
-      "epoch": 1.0344827586206897,
-      "grad_norm": 2.0040283203125,
-      "learning_rate": 6.684995013367826e-05,
-      "loss": 1.9047,
-      "step": 810
-    },
-    {
-      "epoch": 1.0472541507024267,
-      "grad_norm": 1.9755958318710327,
-      "learning_rate": 6.675528487518401e-05,
-      "loss": 1.8602,
-      "step": 820
-    },
-    {
-      "epoch": 1.0600255427841634,
-      "grad_norm": 1.8699226379394531,
-      "learning_rate": 6.665928722853581e-05,
-      "loss": 1.8777,
-      "step": 830
-    },
-    {
-      "epoch": 1.0727969348659003,
-      "grad_norm": 2.011084794998169,
-      "learning_rate": 6.656196122160264e-05,
-      "loss": 1.8186,
-      "step": 840
-    },
-    {
-      "epoch": 1.0855683269476373,
-      "grad_norm": 1.9715994596481323,
-      "learning_rate": 6.646331093798883e-05,
-      "loss": 1.8738,
-      "step": 850
-    },
-    {
-      "epoch": 1.0983397190293742,
-      "grad_norm": 1.997855305671692,
-      "learning_rate": 6.63633405168627e-05,
-      "loss": 1.8658,
-      "step": 860
-    },
-    {
-      "epoch": 1.1111111111111112,
-      "grad_norm": 1.9899482727050781,
-      "learning_rate": 6.626205415278291e-05,
-      "loss": 1.8366,
-      "step": 870
-    },
-    {
-      "epoch": 1.123882503192848,
-      "grad_norm": 1.9865031242370605,
-      "learning_rate": 6.615945609552244e-05,
-      "loss": 1.8194,
-      "step": 880
-    },
-    {
-      "epoch": 1.136653895274585,
-      "grad_norm": 1.8949304819107056,
-      "learning_rate": 6.605555064989027e-05,
-      "loss": 1.8091,
-      "step": 890
-    },
-    {
-      "epoch": 1.1494252873563218,
-      "grad_norm": 2.125781774520874,
-      "learning_rate": 6.595034217555082e-05,
-      "loss": 1.8225,
-      "step": 900
-    },
-    {
-      "epoch": 1.1494252873563218,
-      "eval_loss": 1.9406205415725708,
-      "eval_runtime": 24.2668,
-      "eval_samples_per_second": 20.604,
-      "eval_steps_per_second": 20.604,
-      "step": 900
-    },
-    {
-      "epoch": 1.1621966794380587,
-      "grad_norm": 1.8366427421569824,
-      "learning_rate": 6.584383508684096e-05,
-      "loss": 1.8386,
-      "step": 910
-    },
-    {
-      "epoch": 1.1749680715197957,
-      "grad_norm": 2.075599431991577,
-      "learning_rate": 6.57360338525848e-05,
-      "loss": 1.8182,
-      "step": 920
-    },
-    {
-      "epoch": 1.1877394636015326,
-      "grad_norm": 1.9955580234527588,
-      "learning_rate": 6.562694299590624e-05,
-      "loss": 1.8186,
-      "step": 930
-    },
-    {
-      "epoch": 1.2005108556832695,
-      "grad_norm": 2.012458086013794,
-      "learning_rate": 6.551656709403914e-05,
-      "loss": 1.8484,
-      "step": 940
-    },
-    {
-      "epoch": 1.2132822477650063,
-      "grad_norm": 2.052921772003174,
-      "learning_rate": 6.540491077813528e-05,
-      "loss": 1.8583,
-      "step": 950
-    },
-    {
-      "epoch": 1.2260536398467432,
-      "grad_norm": 1.8998031616210938,
-      "learning_rate": 6.529197873307006e-05,
-      "loss": 1.7882,
-      "step": 960
-    },
-    {
-      "epoch": 1.2388250319284801,
-      "grad_norm": 2.0680484771728516,
-      "learning_rate": 6.517777569724587e-05,
-      "loss": 1.8675,
-      "step": 970
-    },
-    {
-      "epoch": 1.251596424010217,
-      "grad_norm": 2.0891168117523193,
-      "learning_rate": 6.506230646239343e-05,
-      "loss": 1.8287,
-      "step": 980
-    },
-    {
-      "epoch": 1.264367816091954,
-      "grad_norm": 1.9214017391204834,
-      "learning_rate": 6.494557587337054e-05,
-      "loss": 1.81,
-      "step": 990
-    },
-    {
-      "epoch": 1.277139208173691,
-      "grad_norm": 2.0630176067352295,
-      "learning_rate": 6.482758882795892e-05,
-      "loss": 1.8507,
-      "step": 1000
-    },
-    {
-      "epoch": 1.289910600255428,
-      "grad_norm": 2.1162936687469482,
-      "learning_rate": 6.47083502766587e-05,
-      "loss": 1.9005,
-      "step": 1010
-    },
-    {
-      "epoch": 1.3026819923371646,
-      "grad_norm": 2.150144100189209,
-      "learning_rate": 6.458786522248068e-05,
-      "loss": 1.8002,
-      "step": 1020
-    },
-    {
-      "epoch": 1.3154533844189016,
-      "grad_norm": 1.9569624662399292,
-      "learning_rate": 6.446613872073644e-05,
-      "loss": 1.8005,
-      "step": 1030
-    },
-    {
-      "epoch": 1.3282247765006385,
-      "grad_norm": 2.029466390609741,
-      "learning_rate": 6.434317587882618e-05,
-      "loss": 1.8143,
-      "step": 1040
-    },
-    {
-      "epoch": 1.3409961685823755,
-      "grad_norm": 2.1835060119628906,
-      "learning_rate": 6.421898185602448e-05,
-      "loss": 1.9125,
-      "step": 1050
-    },
-    {
-      "epoch": 1.3409961685823755,
-      "eval_loss": 1.9381868839263916,
-      "eval_runtime": 24.2287,
-      "eval_samples_per_second": 20.637,
-      "eval_steps_per_second": 20.637,
-      "step": 1050
-    },
-    {
-      "epoch": 1.3537675606641124,
-      "grad_norm": 2.052374839782715,
-      "learning_rate": 6.409356186326383e-05,
-      "loss": 1.8531,
-      "step": 1060
-    },
-    {
-      "epoch": 1.3665389527458494,
-      "grad_norm": 2.042969226837158,
-      "learning_rate": 6.396692116291589e-05,
-      "loss": 1.8755,
-      "step": 1070
-    },
-    {
-      "epoch": 1.3793103448275863,
-      "grad_norm": 1.9522734880447388,
-      "learning_rate": 6.383906506857083e-05,
-      "loss": 1.8719,
-      "step": 1080
-    },
-    {
-      "epoch": 1.392081736909323,
-      "grad_norm": 1.8483445644378662,
-      "learning_rate": 6.37099989448143e-05,
-      "loss": 1.8538,
-      "step": 1090
-    },
-    {
-      "epoch": 1.40485312899106,
-      "grad_norm": 2.2309532165527344,
-      "learning_rate": 6.357972820700237e-05,
-      "loss": 1.8851,
-      "step": 1100
-    },
-    {
-      "epoch": 1.417624521072797,
-      "grad_norm": 1.9797334671020508,
-      "learning_rate": 6.344825832103429e-05,
-      "loss": 1.8704,
-      "step": 1110
-    },
-    {
-      "epoch": 1.4303959131545338,
-      "grad_norm": 2.023789167404175,
-      "learning_rate": 6.331559480312315e-05,
-      "loss": 1.8692,
-      "step": 1120
-    },
-    {
-      "epoch": 1.4431673052362708,
-      "grad_norm": 1.9136630296707153,
-      "learning_rate": 6.318174321956448e-05,
-      "loss": 1.8521,
-      "step": 1130
-    },
-    {
-      "epoch": 1.4559386973180077,
-      "grad_norm": 2.0023677349090576,
-      "learning_rate": 6.304670918650265e-05,
-      "loss": 1.8565,
-      "step": 1140
-    },
-    {
-      "epoch": 1.4687100893997447,
-      "grad_norm": 2.022937536239624,
-      "learning_rate": 6.291049836969522e-05,
-      "loss": 1.8222,
-      "step": 1150
-    },
-    {
-      "epoch": 1.4814814814814814,
-      "grad_norm": 2.031189203262329,
-      "learning_rate": 6.277311648427529e-05,
-      "loss": 1.8933,
-      "step": 1160
-    },
-    {
-      "epoch": 1.4942528735632183,
-      "grad_norm": 2.021033763885498,
-      "learning_rate": 6.26345692945116e-05,
-      "loss": 1.8851,
-      "step": 1170
-    },
-    {
-      "epoch": 1.5070242656449553,
-      "grad_norm": 1.9933768510818481,
-      "learning_rate": 6.249486261356676e-05,
-      "loss": 1.8925,
-      "step": 1180
-    },
-    {
-      "epoch": 1.5197956577266922,
-      "grad_norm": 2.0429482460021973,
-      "learning_rate": 6.23540023032533e-05,
-      "loss": 1.8536,
-      "step": 1190
-    },
-    {
-      "epoch": 1.5325670498084292,
-      "grad_norm": 1.950465440750122,
-      "learning_rate": 6.221199427378772e-05,
-      "loss": 1.9116,
-      "step": 1200
-    },
-    {
-      "epoch": 1.5325670498084292,
-      "eval_loss": 1.9389454126358032,
-      "eval_runtime": 24.2278,
-      "eval_samples_per_second": 20.637,
-      "eval_steps_per_second": 20.637,
-      "step": 1200
-    },
-    {
-      "epoch": 1.545338441890166,
-      "grad_norm": 1.9224931001663208,
-      "learning_rate": 6.206884448354253e-05,
-      "loss": 1.8073,
-      "step": 1210
-    },
-    {
-      "epoch": 1.558109833971903,
-      "grad_norm": 1.9591902494430542,
-      "learning_rate": 6.192455893879623e-05,
-      "loss": 1.8249,
-      "step": 1220
-    },
-    {
-      "epoch": 1.5708812260536398,
-      "grad_norm": 1.9398179054260254,
-      "learning_rate": 6.177914369348129e-05,
-      "loss": 1.8701,
-      "step": 1230
-    },
-    {
-      "epoch": 1.5836526181353767,
-      "grad_norm": 1.8603562116622925,
-      "learning_rate": 6.163260484893019e-05,
-      "loss": 1.8398,
-      "step": 1240
-    },
-    {
-      "epoch": 1.5964240102171137,
-      "grad_norm": 2.005242347717285,
-      "learning_rate": 6.148494855361933e-05,
-      "loss": 1.8626,
-      "step": 1250
-    },
-    {
-      "epoch": 1.6091954022988506,
-      "grad_norm": 1.8576045036315918,
-      "learning_rate": 6.133618100291116e-05,
-      "loss": 1.8573,
-      "step": 1260
-    },
-    {
-      "epoch": 1.6219667943805876,
-      "grad_norm": 2.0463547706604004,
-      "learning_rate": 6.118630843879414e-05,
-      "loss": 1.8162,
-      "step": 1270
-    },
-    {
-      "epoch": 1.6347381864623243,
-      "grad_norm": 1.9812066555023193,
-      "learning_rate": 6.10353371496209e-05,
-      "loss": 1.8147,
-      "step": 1280
-    },
-    {
-      "epoch": 1.6475095785440614,
-      "grad_norm": 2.0018861293792725,
-      "learning_rate": 6.088327346984437e-05,
-      "loss": 1.7751,
-      "step": 1290
-    },
-    {
-      "epoch": 1.6602809706257982,
-      "grad_norm": 2.017937183380127,
-      "learning_rate": 6.073012377975201e-05,
-      "loss": 1.8877,
-      "step": 1300
-    },
-    {
-      "epoch": 1.673052362707535,
-      "grad_norm": 1.9787315130233765,
-      "learning_rate": 6.057589450519807e-05,
-      "loss": 1.8283,
-      "step": 1310
-    },
-    {
-      "epoch": 1.685823754789272,
-      "grad_norm": 2.075465679168701,
-      "learning_rate": 6.042059211733404e-05,
-      "loss": 1.8498,
-      "step": 1320
-    },
-    {
-      "epoch": 1.698595146871009,
-      "grad_norm": 2.0676026344299316,
-      "learning_rate": 6.026422313233707e-05,
-      "loss": 1.8781,
-      "step": 1330
-    },
-    {
-      "epoch": 1.711366538952746,
-      "grad_norm": 2.158276319503784,
-      "learning_rate": 6.010679411113658e-05,
-      "loss": 1.8436,
-      "step": 1340
-    },
-    {
-      "epoch": 1.7241379310344827,
-      "grad_norm": 2.1169486045837402,
-      "learning_rate": 5.9948311659139024e-05,
-      "loss": 1.8629,
-      "step": 1350
-    },
-    {
-      "epoch": 1.7241379310344827,
-      "eval_loss": 1.9343986511230469,
-      "eval_runtime": 24.1802,
-      "eval_samples_per_second": 20.678,
-      "eval_steps_per_second": 20.678,
-      "step": 1350
-    },
-    {
-      "epoch": 1.7369093231162198,
-      "grad_norm": 1.857434868812561,
-      "learning_rate": 5.978878242595065e-05,
-      "loss": 1.8505,
-      "step": 1360
-    },
-    {
-      "epoch": 1.7496807151979565,
-      "grad_norm": 2.055877923965454,
-      "learning_rate": 5.962821310509861e-05,
-      "loss": 1.8437,
-      "step": 1370
-    },
-    {
-      "epoch": 1.7624521072796935,
-      "grad_norm": 1.8985681533813477,
-      "learning_rate": 5.9466610433750005e-05,
-      "loss": 1.8312,
-      "step": 1380
-    },
-    {
-      "epoch": 1.7752234993614304,
-      "grad_norm": 1.991992712020874,
-      "learning_rate": 5.9303981192429265e-05,
-      "loss": 1.9209,
-      "step": 1390
-    },
-    {
-      "epoch": 1.7879948914431671,
-      "grad_norm": 1.845304012298584,
-      "learning_rate": 5.914033220473365e-05,
-      "loss": 1.85,
-      "step": 1400
-    },
-    {
-      "epoch": 1.8007662835249043,
-      "grad_norm": 1.8313244581222534,
-      "learning_rate": 5.897567033704691e-05,
-      "loss": 1.8897,
-      "step": 1410
-    },
-    {
-      "epoch": 1.813537675606641,
-      "grad_norm": 1.8554408550262451,
-      "learning_rate": 5.881000249825124e-05,
-      "loss": 1.8012,
-      "step": 1420
-    },
-    {
-      "epoch": 1.8263090676883782,
-      "grad_norm": 2.053835868835449,
-      "learning_rate": 5.8643335639437366e-05,
-      "loss": 1.8708,
-      "step": 1430
-    },
-    {
-      "epoch": 1.839080459770115,
-      "grad_norm": 1.9372299909591675,
-      "learning_rate": 5.847567675361288e-05,
-      "loss": 1.8338,
-      "step": 1440
-    },
-    {
-      "epoch": 1.8518518518518519,
-      "grad_norm": 2.2738914489746094,
-      "learning_rate": 5.830703287540883e-05,
-      "loss": 1.8748,
-      "step": 1450
-    },
-    {
-      "epoch": 1.8646232439335888,
-      "grad_norm": 2.297995090484619,
-      "learning_rate": 5.813741108078461e-05,
-      "loss": 1.8276,
-      "step": 1460
-    },
-    {
-      "epoch": 1.8773946360153255,
-      "grad_norm": 1.985796332359314,
-      "learning_rate": 5.796681848673098e-05,
-      "loss": 1.9204,
-      "step": 1470
-    },
-    {
-      "epoch": 1.8901660280970627,
-      "grad_norm": 1.8658971786499023,
-      "learning_rate": 5.779526225097153e-05,
-      "loss": 1.8205,
-      "step": 1480
-    },
-    {
-      "epoch": 1.9029374201787994,
-      "grad_norm": 1.9643296003341675,
-      "learning_rate": 5.762274957166233e-05,
-      "loss": 1.8421,
-      "step": 1490
-    },
-    {
-      "epoch": 1.9157088122605364,
-      "grad_norm": 1.924149751663208,
-      "learning_rate": 5.7449287687089895e-05,
-      "loss": 1.8696,
-      "step": 1500
-    },
-    {
-      "epoch": 1.9157088122605364,
-      "eval_loss": 1.9337016344070435,
-      "eval_runtime": 24.181,
-      "eval_samples_per_second": 20.677,
-      "eval_steps_per_second": 20.677,
-      "step": 1500
-    },
-    {
-      "epoch": 1.9284802043422733,
-      "grad_norm": 1.9967809915542603,
-      "learning_rate": 5.727488387536748e-05,
-      "loss": 1.8602,
-      "step": 1510
-    },
-    {
-      "epoch": 1.9412515964240102,
-      "grad_norm": 1.9925228357315063,
-      "learning_rate": 5.709954545412975e-05,
-      "loss": 1.8775,
-      "step": 1520
-    },
-    {
-      "epoch": 1.9540229885057472,
-      "grad_norm": 2.0943105220794678,
-      "learning_rate": 5.692327978022566e-05,
-      "loss": 1.8434,
-      "step": 1530
-    },
-    {
-      "epoch": 1.966794380587484,
-      "grad_norm": 1.9929866790771484,
-      "learning_rate": 5.67460942494099e-05,
-      "loss": 1.8754,
-      "step": 1540
-    },
-    {
-      "epoch": 1.979565772669221,
-      "grad_norm": 1.9730305671691895,
-      "learning_rate": 5.656799629603245e-05,
-      "loss": 1.8585,
-      "step": 1550
-    },
-    {
-      "epoch": 1.9923371647509578,
-      "grad_norm": 2.1760172843933105,
-      "learning_rate": 5.638899339272675e-05,
-      "loss": 1.8677,
-      "step": 1560
-    },
-    {
-      "epoch": 2.005108556832695,
-      "grad_norm": 1.958113193511963,
-      "learning_rate": 5.6209093050096115e-05,
-      "loss": 1.7717,
-      "step": 1570
-    },
-    {
-      "epoch": 2.0178799489144317,
-      "grad_norm": 1.8135682344436646,
-      "learning_rate": 5.6028302816398624e-05,
-      "loss": 1.7952,
-      "step": 1580
-    },
-    {
-      "epoch": 2.0306513409961684,
-      "grad_norm": 1.9992247819900513,
-      "learning_rate": 5.5846630277230384e-05,
-      "loss": 1.8559,
-      "step": 1590
-    },
-    {
-      "epoch": 2.0434227330779056,
-      "grad_norm": 1.9503588676452637,
-      "learning_rate": 5.566408305520729e-05,
-      "loss": 1.8136,
-      "step": 1600
-    },
-    {
-      "epoch": 2.0561941251596423,
-      "grad_norm": 2.0600390434265137,
-      "learning_rate": 5.5480668809645175e-05,
-      "loss": 1.8126,
-      "step": 1610
-    },
-    {
-      "epoch": 2.0689655172413794,
-      "grad_norm": 1.9783591032028198,
-      "learning_rate": 5.5296395236238445e-05,
-      "loss": 1.8198,
-      "step": 1620
-    },
-    {
-      "epoch": 2.081736909323116,
-      "grad_norm": 2.09946346282959,
-      "learning_rate": 5.5111270066737163e-05,
-      "loss": 1.7505,
-      "step": 1630
-    },
-    {
-      "epoch": 2.0945083014048533,
-      "grad_norm": 1.9546903371810913,
-      "learning_rate": 5.492530106862269e-05,
-      "loss": 1.8198,
-      "step": 1640
-    },
-    {
-      "epoch": 2.10727969348659,
-      "grad_norm": 2.0820565223693848,
-      "learning_rate": 5.4738496044781724e-05,
-      "loss": 1.8473,
-      "step": 1650
-    },
-    {
-      "epoch": 2.10727969348659,
-      "eval_loss": 1.9384523630142212,
-      "eval_runtime": 24.2888,
-      "eval_samples_per_second": 20.586,
-      "eval_steps_per_second": 20.586,
-      "step": 1650
-    },
-    {
-      "epoch": 2.1200510855683268,
-      "grad_norm": 2.078505516052246,
-      "learning_rate": 5.4550862833178926e-05,
-      "loss": 1.8142,
-      "step": 1660
-    },
-    {
-      "epoch": 2.132822477650064,
-      "grad_norm": 2.0138731002807617,
-      "learning_rate": 5.436240930652807e-05,
-      "loss": 1.8039,
-      "step": 1670
-    },
-    {
-      "epoch": 2.1455938697318007,
-      "grad_norm": 1.958559513092041,
-      "learning_rate": 5.41731433719617e-05,
-      "loss": 1.8039,
-      "step": 1680
-    },
-    {
-      "epoch": 2.158365261813538,
-      "grad_norm": 2.14679217338562,
-      "learning_rate": 5.398307297069936e-05,
-      "loss": 1.7873,
-      "step": 1690
-    },
-    {
-      "epoch": 2.1711366538952745,
-      "grad_norm": 2.2213780879974365,
-      "learning_rate": 5.379220607771443e-05,
-      "loss": 1.7886,
-      "step": 1700
-    },
-    {
-      "epoch": 2.1839080459770113,
-      "grad_norm": 1.9779000282287598,
-      "learning_rate": 5.3600550701399455e-05,
-      "loss": 1.7599,
-      "step": 1710
-    },
-    {
-      "epoch": 2.1966794380587484,
-      "grad_norm": 2.0890724658966064,
-      "learning_rate": 5.340811488323019e-05,
-      "loss": 1.7752,
-      "step": 1720
-    },
-    {
-      "epoch": 2.209450830140485,
-      "grad_norm": 2.2274386882781982,
-      "learning_rate": 5.321490669742815e-05,
-      "loss": 1.8219,
-      "step": 1730
-    },
-    {
-      "epoch": 2.2222222222222223,
-      "grad_norm": 2.2681832313537598,
-      "learning_rate": 5.302093425062187e-05,
-      "loss": 1.7979,
-      "step": 1740
-    },
-    {
-      "epoch": 2.234993614303959,
-      "grad_norm": 2.136756420135498,
-      "learning_rate": 5.28262056815067e-05,
-      "loss": 1.7548,
-      "step": 1750
-    },
-    {
-      "epoch": 2.247765006385696,
-      "grad_norm": 2.3203325271606445,
-      "learning_rate": 5.263072916050342e-05,
-      "loss": 1.8238,
-      "step": 1760
-    },
-    {
-      "epoch": 2.260536398467433,
-      "grad_norm": 2.19116473197937,
-      "learning_rate": 5.243451288941535e-05,
-      "loss": 1.8143,
-      "step": 1770
-    },
-    {
-      "epoch": 2.27330779054917,
-      "grad_norm": 2.180558204650879,
-      "learning_rate": 5.2237565101084235e-05,
-      "loss": 1.7681,
-      "step": 1780
-    },
-    {
-      "epoch": 2.286079182630907,
-      "grad_norm": 2.287687063217163,
-      "learning_rate": 5.203989405904483e-05,
-      "loss": 1.7872,
-      "step": 1790
-    },
-    {
-      "epoch": 2.2988505747126435,
-      "grad_norm": 2.288097620010376,
-      "learning_rate": 5.1841508057178177e-05,
-      "loss": 1.7939,
-      "step": 1800
-    },
-    {
-      "epoch": 2.2988505747126435,
-      "eval_loss": 1.9397783279418945,
-      "eval_runtime": 24.3943,
-      "eval_samples_per_second": 20.497,
-      "eval_steps_per_second": 20.497,
-      "step": 1800
-    },
-    {
-      "epoch": 2.3116219667943807,
-      "grad_norm": 2.1380770206451416,
-      "learning_rate": 1.728050268572606e-06,
-      "loss": 1.7778,
-      "step": 1810
-    },
-    {
-      "epoch": 2.3243933588761174,
-      "grad_norm": 2.165830135345459,
-      "learning_rate": 3.456100537145212e-06,
-      "loss": 1.7769,
-      "step": 1820
-    },
-    {
-      "epoch": 2.3371647509578546,
-      "grad_norm": 2.399066209793091,
-      "learning_rate": 5.184150805717818e-06,
-      "loss": 1.7873,
-      "step": 1830
-    },
-    {
-      "epoch": 2.3499361430395913,
-      "grad_norm": 2.234840154647827,
-      "learning_rate": 6.912201074290424e-06,
-      "loss": 1.7203,
-      "step": 1840
-    },
-    {
-      "epoch": 2.362707535121328,
-      "grad_norm": 2.12174916267395,
-      "learning_rate": 8.640251342863028e-06,
-      "loss": 1.8158,
-      "step": 1850
-    },
-    {
-      "epoch": 2.375478927203065,
-      "grad_norm": 2.109421968460083,
-      "learning_rate": 1.0368301611435635e-05,
-      "loss": 1.7586,
-      "step": 1860
-    },
-    {
-      "epoch": 2.388250319284802,
-      "grad_norm": 2.1194908618927,
-      "learning_rate": 1.209635188000824e-05,
-      "loss": 1.7936,
-      "step": 1870
-    },
-    {
-      "epoch": 2.401021711366539,
-      "grad_norm": 2.211038589477539,
-      "learning_rate": 1.3824402148580848e-05,
-      "loss": 1.7774,
-      "step": 1880
-    },
-    {
-      "epoch": 2.413793103448276,
-      "grad_norm": 2.295914888381958,
-      "learning_rate": 1.5552452417153453e-05,
-      "loss": 1.817,
-      "step": 1890
-    },
-    {
-      "epoch": 2.4265644955300125,
-      "grad_norm": 2.0694994926452637,
-      "learning_rate": 1.7280502685726057e-05,
-      "loss": 1.8039,
-      "step": 1900
-    },
-    {
-      "epoch": 2.4393358876117497,
-      "grad_norm": 2.2239274978637695,
-      "learning_rate": 1.9008552954298664e-05,
-      "loss": 1.8505,
-      "step": 1910
-    },
-    {
-      "epoch": 2.4521072796934864,
-      "grad_norm": 2.0061118602752686,
-      "learning_rate": 2.073660322287127e-05,
-      "loss": 1.7951,
-      "step": 1920
-    },
-    {
-      "epoch": 2.4648786717752236,
-      "grad_norm": 2.198082447052002,
-      "learning_rate": 2.2464653491443878e-05,
-      "loss": 1.8012,
-      "step": 1930
-    },
-    {
-      "epoch": 2.4776500638569603,
-      "grad_norm": 1.9910808801651,
-      "learning_rate": 2.419270376001648e-05,
-      "loss": 1.695,
-      "step": 1940
-    },
-    {
-      "epoch": 2.4904214559386975,
-      "grad_norm": 2.2533199787139893,
-      "learning_rate": 2.5920754028589088e-05,
-      "loss": 1.7852,
-      "step": 1950
-    },
-    {
-      "epoch": 2.4904214559386975,
-      "eval_loss": 1.936609148979187,
-      "eval_runtime": 24.3942,
-      "eval_samples_per_second": 20.497,
-      "eval_steps_per_second": 20.497,
-      "step": 1950
-    },
-    {
-      "epoch": 2.503192848020434,
-      "grad_norm": 2.3623099327087402,
-      "learning_rate": 2.5920482132857834e-05,
-      "loss": 1.8228,
-      "step": 1960
-    },
-    {
-      "epoch": 2.5159642401021713,
-      "grad_norm": 2.2549827098846436,
-      "learning_rate": 2.5919666457072272e-05,
-      "loss": 1.7792,
-      "step": 1970
-    },
-    {
-      "epoch": 2.528735632183908,
-      "grad_norm": 2.1171176433563232,
-      "learning_rate": 2.5918307035456523e-05,
-      "loss": 1.8581,
-      "step": 1980
-    },
-    {
-      "epoch": 2.541507024265645,
-      "grad_norm": 2.118044137954712,
-      "learning_rate": 2.5916403925049198e-05,
-      "loss": 1.8363,
-      "step": 1990
-    },
-    {
-      "epoch": 2.554278416347382,
-      "grad_norm": 2.404895782470703,
-      "learning_rate": 2.5913957205700998e-05,
-      "loss": 1.8638,
-      "step": 2000
-    },
-    {
-      "epoch": 2.5670498084291187,
-      "grad_norm": 2.2589292526245117,
-      "learning_rate": 2.591096698007136e-05,
-      "loss": 1.817,
-      "step": 2010
-    },
-    {
-      "epoch": 2.579821200510856,
-      "grad_norm": 2.308377981185913,
-      "learning_rate": 2.5907433373624172e-05,
-      "loss": 1.7985,
-      "step": 2020
-    },
-    {
-      "epoch": 2.5925925925925926,
-      "grad_norm": 2.1969752311706543,
-      "learning_rate": 2.5903356534622476e-05,
-      "loss": 1.8013,
-      "step": 2030
-    },
-    {
-      "epoch": 2.6053639846743293,
-      "grad_norm": 2.2254557609558105,
-      "learning_rate": 2.5898736634122276e-05,
-      "loss": 1.7312,
-      "step": 2040
-    },
-    {
-      "epoch": 2.6181353767560664,
-      "grad_norm": 2.2725250720977783,
-      "learning_rate": 2.5893573865965345e-05,
-      "loss": 1.8751,
-      "step": 2050
-    },
-    {
-      "epoch": 2.630906768837803,
-      "grad_norm": 2.1987719535827637,
-      "learning_rate": 2.588786844677109e-05,
-      "loss": 1.8096,
-      "step": 2060
-    },
-    {
-      "epoch": 2.6436781609195403,
-      "grad_norm": 2.1684072017669678,
-      "learning_rate": 2.588162061592748e-05,
-      "loss": 1.738,
-      "step": 2070
-    },
-    {
-      "epoch": 2.656449553001277,
-      "grad_norm": 2.2119176387786865,
-      "learning_rate": 2.5874830635580974e-05,
-      "loss": 1.7888,
-      "step": 2080
-    },
-    {
-      "epoch": 2.6692209450830138,
-      "grad_norm": 2.2012903690338135,
-      "learning_rate": 2.586749879062556e-05,
-      "loss": 1.8119,
-      "step": 2090
-    },
-    {
-      "epoch": 2.681992337164751,
-      "grad_norm": 2.18770694732666,
-      "learning_rate": 2.5859625388690762e-05,
-      "loss": 1.7867,
-      "step": 2100
-    },
-    {
-      "epoch": 2.681992337164751,
-      "eval_loss": 1.9344319105148315,
-      "eval_runtime": 24.3873,
-      "eval_samples_per_second": 20.502,
-      "eval_steps_per_second": 20.502,
-      "step": 2100
     }
   ],
   "logging_steps": 10,
@@ -1608,7 +139,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.9127653767397376e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 1.9779324531555176,
+  "best_model_checkpoint": "./output/checkpoint-150",
+  "epoch": 0.19157088122605365,
   "eval_steps": 150,
+  "global_step": 150,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.01277139208173691,
+      "grad_norm": 2.1240530014038086,
       "learning_rate": 4.666666666666666e-06,
+      "loss": 1.9176,
       "step": 10
     },
     {
       "epoch": 0.02554278416347382,
+      "grad_norm": 2.017596960067749,
       "learning_rate": 9.333333333333333e-06,
+      "loss": 1.9414,
       "step": 20
     },
     {
       "epoch": 0.038314176245210725,
+      "grad_norm": 2.077119827270508,
       "learning_rate": 1.4e-05,
+      "loss": 1.9121,
       "step": 30
     },
     {
       "epoch": 0.05108556832694764,
+      "grad_norm": 1.9572290182113647,
       "learning_rate": 1.8666666666666665e-05,
+      "loss": 1.8867,
       "step": 40
     },
     {
       "epoch": 0.06385696040868455,
+      "grad_norm": 2.5854249000549316,
       "learning_rate": 2.333333333333333e-05,
       "loss": 1.9779,
       "step": 50
     },
     {
       "epoch": 0.07662835249042145,
+      "grad_norm": 2.080289840698242,
       "learning_rate": 2.8e-05,
+      "loss": 1.9335,
       "step": 60
     },
     {
       "epoch": 0.08939974457215837,
+      "grad_norm": 1.978735089302063,
       "learning_rate": 3.266666666666666e-05,
+      "loss": 1.9452,
       "step": 70
     },
     {
       "epoch": 0.10217113665389528,
+      "grad_norm": 2.1327736377716064,
       "learning_rate": 3.733333333333333e-05,
+      "loss": 1.9443,
       "step": 80
     },
     {
       "epoch": 0.11494252873563218,
+      "grad_norm": 2.057833194732666,
       "learning_rate": 4.2e-05,
+      "loss": 1.9261,
       "step": 90
     },
     {
       "epoch": 0.1277139208173691,
+      "grad_norm": 2.1436352729797363,
       "learning_rate": 4.666666666666666e-05,
+      "loss": 1.9776,
       "step": 100
     },
     {
       "epoch": 0.140485312899106,
+      "grad_norm": 1.885575294494629,
       "learning_rate": 5.1333333333333325e-05,
       "loss": 1.9233,
       "step": 110
     },
     {
       "epoch": 0.1532567049808429,
+      "grad_norm": 2.046653985977173,
       "learning_rate": 5.6e-05,
+      "loss": 1.9403,
       "step": 120
     },
     {
       "epoch": 0.16602809706257982,
+      "grad_norm": 2.272224187850952,
       "learning_rate": 6.0666666666666666e-05,
+      "loss": 1.9421,
       "step": 130
     },
     {
       "epoch": 0.17879948914431673,
+      "grad_norm": 2.098900318145752,
       "learning_rate": 6.533333333333333e-05,
+      "loss": 1.919,
       "step": 140
     },
     {
       "epoch": 0.19157088122605365,
+      "grad_norm": 2.07694411277771,
       "learning_rate": 7e-05,
       "loss": 1.9405,
       "step": 150
     },
     {
       "epoch": 0.19157088122605365,
+      "eval_loss": 1.9779324531555176,
+      "eval_runtime": 25.3379,
+      "eval_samples_per_second": 19.733,
+      "eval_steps_per_second": 19.733,
       "step": 150
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 2084245000224768.0,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:73d63faa96a8013f73d4d225b5f62be5f6f1a8819c12a7d65e93c26570162b6b
 size 5496

 version https://git-lfs.github.com/spec/v1
+oid sha256:5d09d197d3b76d99965de547b76e3d4e9afb2509c824e5faadb664c5461d6450
 size 5496