diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,4171 @@
+{
+  "best_global_step": 55000,
+  "best_metric": 0.8768783517240833,
+  "best_model_checkpoint": "./lang-ner-xlmr/checkpoint-55000",
+  "epoch": 2.0,
+  "eval_steps": 2500,
+  "global_step": 55278,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0036180759072325336,
+      "grad_norm": 5.75448751449585,
+      "learning_rate": 4.9910452621295995e-05,
+      "loss": 4.179392395019531,
+      "step": 100
+    },
+    {
+      "epoch": 0.007236151814465067,
+      "grad_norm": 2.6520659923553467,
+      "learning_rate": 4.9820000723615186e-05,
+      "loss": 0.6058632278442383,
+      "step": 200
+    },
+    {
+      "epoch": 0.010854227721697602,
+      "grad_norm": 3.474226951599121,
+      "learning_rate": 4.972954882593437e-05,
+      "loss": 0.3028737449645996,
+      "step": 300
+    },
+    {
+      "epoch": 0.014472303628930134,
+      "grad_norm": 1.4948221445083618,
+      "learning_rate": 4.963909692825356e-05,
+      "loss": 0.18973339080810547,
+      "step": 400
+    },
+    {
+      "epoch": 0.01809037953616267,
+      "grad_norm": 1.389740228652954,
+      "learning_rate": 4.9548645030572745e-05,
+      "loss": 0.15398676872253417,
+      "step": 500
+    },
+    {
+      "epoch": 0.021708455443395204,
+      "grad_norm": 1.4510504007339478,
+      "learning_rate": 4.945819313289193e-05,
+      "loss": 0.13108017921447754,
+      "step": 600
+    },
+    {
+      "epoch": 0.025326531350627735,
+      "grad_norm": 1.4420865774154663,
+      "learning_rate": 4.936774123521112e-05,
+      "loss": 0.12688090324401854,
+      "step": 700
+    },
+    {
+      "epoch": 0.02894460725786027,
+      "grad_norm": 0.9447225332260132,
+      "learning_rate": 4.92772893375303e-05,
+      "loss": 0.11376466751098632,
+      "step": 800
+    },
+    {
+      "epoch": 0.0325626831650928,
+      "grad_norm": 1.9140123128890991,
+      "learning_rate": 4.9186837439849494e-05,
+      "loss": 0.10734249114990234,
+      "step": 900
+    },
+    {
+      "epoch": 0.03618075907232534,
+      "grad_norm": 1.2182528972625732,
+      "learning_rate": 4.909638554216868e-05,
+      "loss": 0.09950636863708497,
+      "step": 1000
+    },
+    {
+      "epoch": 0.03979883497955787,
+      "grad_norm": 1.5587440729141235,
+      "learning_rate": 4.900593364448786e-05,
+      "loss": 0.08896804809570312,
+      "step": 1100
+    },
+    {
+      "epoch": 0.04341691088679041,
+      "grad_norm": 2.021667242050171,
+      "learning_rate": 4.891548174680705e-05,
+      "loss": 0.09553884506225586,
+      "step": 1200
+    },
+    {
+      "epoch": 0.04703498679402294,
+      "grad_norm": 3.561288595199585,
+      "learning_rate": 4.882502984912624e-05,
+      "loss": 0.0916118335723877,
+      "step": 1300
+    },
+    {
+      "epoch": 0.05065306270125547,
+      "grad_norm": 2.239180088043213,
+      "learning_rate": 4.873457795144543e-05,
+      "loss": 0.08524966239929199,
+      "step": 1400
+    },
+    {
+      "epoch": 0.054271138608488007,
+      "grad_norm": 1.880850076675415,
+      "learning_rate": 4.864412605376461e-05,
+      "loss": 0.08407029151916504,
+      "step": 1500
+    },
+    {
+      "epoch": 0.05788921451572054,
+      "grad_norm": 2.365021228790283,
+      "learning_rate": 4.8553674156083796e-05,
+      "loss": 0.09083961486816407,
+      "step": 1600
+    },
+    {
+      "epoch": 0.061507290422953075,
+      "grad_norm": 1.8810335397720337,
+      "learning_rate": 4.8463222258402987e-05,
+      "loss": 0.0841958236694336,
+      "step": 1700
+    },
+    {
+      "epoch": 0.0651253663301856,
+      "grad_norm": 1.7592241764068604,
+      "learning_rate": 4.837277036072217e-05,
+      "loss": 0.08484026908874512,
+      "step": 1800
+    },
+    {
+      "epoch": 0.06874344223741814,
+      "grad_norm": 1.4012072086334229,
+      "learning_rate": 4.828231846304136e-05,
+      "loss": 0.07917069911956787,
+      "step": 1900
+    },
+    {
+      "epoch": 0.07236151814465068,
+      "grad_norm": 1.6757310628890991,
+      "learning_rate": 4.8191866565360545e-05,
+      "loss": 0.0806041145324707,
+      "step": 2000
+    },
+    {
+      "epoch": 0.0759795940518832,
+      "grad_norm": 0.6598155498504639,
+      "learning_rate": 4.810141466767973e-05,
+      "loss": 0.07851210594177246,
+      "step": 2100
+    },
+    {
+      "epoch": 0.07959766995911574,
+      "grad_norm": 1.5423673391342163,
+      "learning_rate": 4.801096276999892e-05,
+      "loss": 0.08287395477294922,
+      "step": 2200
+    },
+    {
+      "epoch": 0.08321574586634828,
+      "grad_norm": 0.4928501546382904,
+      "learning_rate": 4.7920510872318104e-05,
+      "loss": 0.07287377834320069,
+      "step": 2300
+    },
+    {
+      "epoch": 0.08683382177358082,
+      "grad_norm": 1.8151744604110718,
+      "learning_rate": 4.7830058974637295e-05,
+      "loss": 0.06640945911407471,
+      "step": 2400
+    },
+    {
+      "epoch": 0.09045189768081334,
+      "grad_norm": 1.1932594776153564,
+      "learning_rate": 4.773960707695648e-05,
+      "loss": 0.07295094966888428,
+      "step": 2500
+    },
+    {
+      "epoch": 0.09045189768081334,
+      "eval_accuracy": 0.975962734636331,
+      "eval_f1": 0.7717093579748968,
+      "eval_loss": 0.10806787014007568,
+      "eval_precision": 0.7241184528264584,
+      "eval_recall": 0.8259959084392468,
+      "eval_runtime": 117.8075,
+      "eval_samples_per_second": 169.768,
+      "eval_steps_per_second": 4.72,
+      "step": 2500
+    },
+    {
+      "epoch": 0.09406997358804588,
+      "grad_norm": 1.0983343124389648,
+      "learning_rate": 4.764915517927566e-05,
+      "loss": 0.06925168514251709,
+      "step": 2600
+    },
+    {
+      "epoch": 0.09768804949527841,
+      "grad_norm": 0.8816857933998108,
+      "learning_rate": 4.7558703281594854e-05,
+      "loss": 0.06958985328674316,
+      "step": 2700
+    },
+    {
+      "epoch": 0.10130612540251094,
+      "grad_norm": 0.8671173453330994,
+      "learning_rate": 4.746825138391404e-05,
+      "loss": 0.07468698024749756,
+      "step": 2800
+    },
+    {
+      "epoch": 0.10492420130974348,
+      "grad_norm": 0.27838993072509766,
+      "learning_rate": 4.737779948623322e-05,
+      "loss": 0.07403119087219238,
+      "step": 2900
+    },
+    {
+      "epoch": 0.10854227721697601,
+      "grad_norm": 0.4557673931121826,
+      "learning_rate": 4.728734758855241e-05,
+      "loss": 0.07262114524841308,
+      "step": 3000
+    },
+    {
+      "epoch": 0.11216035312420855,
+      "grad_norm": 0.8267778158187866,
+      "learning_rate": 4.71968956908716e-05,
+      "loss": 0.07057662963867188,
+      "step": 3100
+    },
+    {
+      "epoch": 0.11577842903144107,
+      "grad_norm": 1.401780128479004,
+      "learning_rate": 4.710644379319079e-05,
+      "loss": 0.06252509117126465,
+      "step": 3200
+    },
+    {
+      "epoch": 0.11939650493867361,
+      "grad_norm": 1.7423473596572876,
+      "learning_rate": 4.701599189550997e-05,
+      "loss": 0.06425057411193848,
+      "step": 3300
+    },
+    {
+      "epoch": 0.12301458084590615,
+      "grad_norm": 0.7547276616096497,
+      "learning_rate": 4.6925539997829156e-05,
+      "loss": 0.06438188076019287,
+      "step": 3400
+    },
+    {
+      "epoch": 0.12663265675313867,
+      "grad_norm": 0.4259902238845825,
+      "learning_rate": 4.6835088100148346e-05,
+      "loss": 0.0666530466079712,
+      "step": 3500
+    },
+    {
+      "epoch": 0.1302507326603712,
+      "grad_norm": 0.42786452174186707,
+      "learning_rate": 4.674463620246753e-05,
+      "loss": 0.05976760864257812,
+      "step": 3600
+    },
+    {
+      "epoch": 0.13386880856760375,
+      "grad_norm": 1.1275266408920288,
+      "learning_rate": 4.665418430478672e-05,
+      "loss": 0.06228343009948731,
+      "step": 3700
+    },
+    {
+      "epoch": 0.13748688447483629,
+      "grad_norm": 1.345894455909729,
+      "learning_rate": 4.6563732407105905e-05,
+      "loss": 0.0695729398727417,
+      "step": 3800
+    },
+    {
+      "epoch": 0.14110496038206882,
+      "grad_norm": 0.5640186071395874,
+      "learning_rate": 4.647328050942509e-05,
+      "loss": 0.06416056156158448,
+      "step": 3900
+    },
+    {
+      "epoch": 0.14472303628930136,
+      "grad_norm": 1.5667623281478882,
+      "learning_rate": 4.638282861174428e-05,
+      "loss": 0.06927279949188232,
+      "step": 4000
+    },
+    {
+      "epoch": 0.14834111219653387,
+      "grad_norm": 0.4014199674129486,
+      "learning_rate": 4.6292376714063464e-05,
+      "loss": 0.060500779151916505,
+      "step": 4100
+    },
+    {
+      "epoch": 0.1519591881037664,
+      "grad_norm": 0.8349173069000244,
+      "learning_rate": 4.6201924816382655e-05,
+      "loss": 0.05734441757202149,
+      "step": 4200
+    },
+    {
+      "epoch": 0.15557726401099894,
+      "grad_norm": 0.48946359753608704,
+      "learning_rate": 4.611147291870184e-05,
+      "loss": 0.0637766456604004,
+      "step": 4300
+    },
+    {
+      "epoch": 0.15919533991823148,
+      "grad_norm": 0.44791749119758606,
+      "learning_rate": 4.602102102102102e-05,
+      "loss": 0.0613397216796875,
+      "step": 4400
+    },
+    {
+      "epoch": 0.16281341582546402,
+      "grad_norm": 1.0726768970489502,
+      "learning_rate": 4.5930569123340214e-05,
+      "loss": 0.07220725536346435,
+      "step": 4500
+    },
+    {
+      "epoch": 0.16643149173269656,
+      "grad_norm": 0.48238834738731384,
+      "learning_rate": 4.58401172256594e-05,
+      "loss": 0.05229937076568603,
+      "step": 4600
+    },
+    {
+      "epoch": 0.1700495676399291,
+      "grad_norm": 0.4427547752857208,
+      "learning_rate": 4.574966532797859e-05,
+      "loss": 0.06027111530303955,
+      "step": 4700
+    },
+    {
+      "epoch": 0.17366764354716163,
+      "grad_norm": 0.44010627269744873,
+      "learning_rate": 4.565921343029777e-05,
+      "loss": 0.06117689609527588,
+      "step": 4800
+    },
+    {
+      "epoch": 0.17728571945439414,
+      "grad_norm": 0.26065585017204285,
+      "learning_rate": 4.5568761532616956e-05,
+      "loss": 0.060817084312438964,
+      "step": 4900
+    },
+    {
+      "epoch": 0.18090379536162668,
+      "grad_norm": 0.41624584794044495,
+      "learning_rate": 4.547830963493615e-05,
+      "loss": 0.06215104579925537,
+      "step": 5000
+    },
+    {
+      "epoch": 0.18090379536162668,
+      "eval_accuracy": 0.9724426137358435,
+      "eval_f1": 0.741559979115958,
+      "eval_loss": 0.12759321928024292,
+      "eval_precision": 0.6822080909213909,
+      "eval_recall": 0.8122231350376133,
+      "eval_runtime": 63.257,
+      "eval_samples_per_second": 316.17,
+      "eval_steps_per_second": 8.79,
+      "step": 5000
+    },
+    {
+      "epoch": 0.18452187126885922,
+      "grad_norm": 1.1262469291687012,
+      "learning_rate": 4.538785773725533e-05,
+      "loss": 0.056777148246765136,
+      "step": 5100
+    },
+    {
+      "epoch": 0.18813994717609175,
+      "grad_norm": 0.44265300035476685,
+      "learning_rate": 4.5297405839574515e-05,
+      "loss": 0.05986386775970459,
+      "step": 5200
+    },
+    {
+      "epoch": 0.1917580230833243,
+      "grad_norm": 0.5468171238899231,
+      "learning_rate": 4.5206953941893706e-05,
+      "loss": 0.05671721935272217,
+      "step": 5300
+    },
+    {
+      "epoch": 0.19537609899055683,
+      "grad_norm": 0.3858329653739929,
+      "learning_rate": 4.511650204421289e-05,
+      "loss": 0.05604006290435791,
+      "step": 5400
+    },
+    {
+      "epoch": 0.19899417489778937,
+      "grad_norm": 1.0813618898391724,
+      "learning_rate": 4.502605014653208e-05,
+      "loss": 0.05299887180328369,
+      "step": 5500
+    },
+    {
+      "epoch": 0.20261225080502188,
+      "grad_norm": 0.7834122776985168,
+      "learning_rate": 4.4935598248851265e-05,
+      "loss": 0.0669465970993042,
+      "step": 5600
+    },
+    {
+      "epoch": 0.2062303267122544,
+      "grad_norm": 0.8666114211082458,
+      "learning_rate": 4.484514635117045e-05,
+      "loss": 0.06568387985229492,
+      "step": 5700
+    },
+    {
+      "epoch": 0.20984840261948695,
+      "grad_norm": 0.7354055643081665,
+      "learning_rate": 4.475469445348964e-05,
+      "loss": 0.06354703903198242,
+      "step": 5800
+    },
+    {
+      "epoch": 0.2134664785267195,
+      "grad_norm": 0.3984626829624176,
+      "learning_rate": 4.4664242555808824e-05,
+      "loss": 0.05610593318939209,
+      "step": 5900
+    },
+    {
+      "epoch": 0.21708455443395203,
+      "grad_norm": 0.5307297110557556,
+      "learning_rate": 4.4573790658128014e-05,
+      "loss": 0.058310718536376954,
+      "step": 6000
+    },
+    {
+      "epoch": 0.22070263034118456,
+      "grad_norm": 0.23685064911842346,
+      "learning_rate": 4.44833387604472e-05,
+      "loss": 0.0474505615234375,
+      "step": 6100
+    },
+    {
+      "epoch": 0.2243207062484171,
+      "grad_norm": 0.6271052360534668,
+      "learning_rate": 4.439288686276638e-05,
+      "loss": 0.05871774673461914,
+      "step": 6200
+    },
+    {
+      "epoch": 0.22793878215564964,
+      "grad_norm": 0.6762889623641968,
+      "learning_rate": 4.430243496508557e-05,
+      "loss": 0.05517944812774658,
+      "step": 6300
+    },
+    {
+      "epoch": 0.23155685806288215,
+      "grad_norm": 0.9603418111801147,
+      "learning_rate": 4.421198306740476e-05,
+      "loss": 0.05483291625976563,
+      "step": 6400
+    },
+    {
+      "epoch": 0.23517493397011469,
+      "grad_norm": 0.6032853126525879,
+      "learning_rate": 4.412153116972395e-05,
+      "loss": 0.05903904914855957,
+      "step": 6500
+    },
+    {
+      "epoch": 0.23879300987734722,
+      "grad_norm": 0.40814077854156494,
+      "learning_rate": 4.403107927204313e-05,
+      "loss": 0.05642669677734375,
+      "step": 6600
+    },
+    {
+      "epoch": 0.24241108578457976,
+      "grad_norm": 0.5799020528793335,
+      "learning_rate": 4.3940627374362316e-05,
+      "loss": 0.055092153549194334,
+      "step": 6700
+    },
+    {
+      "epoch": 0.2460291616918123,
+      "grad_norm": 1.0993859767913818,
+      "learning_rate": 4.385017547668151e-05,
+      "loss": 0.054167227745056154,
+      "step": 6800
+    },
+    {
+      "epoch": 0.24964723759904484,
+      "grad_norm": 1.9801974296569824,
+      "learning_rate": 4.375972357900069e-05,
+      "loss": 0.057117671966552735,
+      "step": 6900
+    },
+    {
+      "epoch": 0.25326531350627735,
+      "grad_norm": 0.4046414792537689,
+      "learning_rate": 4.366927168131988e-05,
+      "loss": 0.054672832489013674,
+      "step": 7000
+    },
+    {
+      "epoch": 0.2568833894135099,
+      "grad_norm": 0.41931968927383423,
+      "learning_rate": 4.3578819783639066e-05,
+      "loss": 0.05668231964111328,
+      "step": 7100
+    },
+    {
+      "epoch": 0.2605014653207424,
+      "grad_norm": 0.5075521469116211,
+      "learning_rate": 4.348836788595825e-05,
+      "loss": 0.05900467395782471,
+      "step": 7200
+    },
+    {
+      "epoch": 0.264119541227975,
+      "grad_norm": 1.0615949630737305,
+      "learning_rate": 4.339791598827744e-05,
+      "loss": 0.060022168159484864,
+      "step": 7300
+    },
+    {
+      "epoch": 0.2677376171352075,
+      "grad_norm": 0.6786783337593079,
+      "learning_rate": 4.3307464090596625e-05,
+      "loss": 0.053788251876831054,
+      "step": 7400
+    },
+    {
+      "epoch": 0.27135569304244,
+      "grad_norm": 0.7518507838249207,
+      "learning_rate": 4.321701219291581e-05,
+      "loss": 0.05555037975311279,
+      "step": 7500
+    },
+    {
+      "epoch": 0.27135569304244,
+      "eval_accuracy": 0.9812751684036897,
+      "eval_f1": 0.8064070486745359,
+      "eval_loss": 0.08261791616678238,
+      "eval_precision": 0.7701385325808107,
+      "eval_recall": 0.8462604101225857,
+      "eval_runtime": 62.4561,
+      "eval_samples_per_second": 320.225,
+      "eval_steps_per_second": 8.902,
+      "step": 7500
+    },
+    {
+      "epoch": 0.27497376894967257,
+      "grad_norm": 0.8300764560699463,
+      "learning_rate": 4.3126560295235e-05,
+      "loss": 0.051460466384887694,
+      "step": 7600
+    },
+    {
+      "epoch": 0.2785918448569051,
+      "grad_norm": 1.0100982189178467,
+      "learning_rate": 4.303610839755418e-05,
+      "loss": 0.05660095691680908,
+      "step": 7700
+    },
+    {
+      "epoch": 0.28220992076413765,
+      "grad_norm": 0.5547285676002502,
+      "learning_rate": 4.2945656499873374e-05,
+      "loss": 0.05661679267883301,
+      "step": 7800
+    },
+    {
+      "epoch": 0.28582799667137015,
+      "grad_norm": 0.49258002638816833,
+      "learning_rate": 4.285520460219256e-05,
+      "loss": 0.04981692790985107,
+      "step": 7900
+    },
+    {
+      "epoch": 0.2894460725786027,
+      "grad_norm": 2.1518049240112305,
+      "learning_rate": 4.276475270451174e-05,
+      "loss": 0.04876615524291992,
+      "step": 8000
+    },
+    {
+      "epoch": 0.29306414848583523,
+      "grad_norm": 0.973175048828125,
+      "learning_rate": 4.267430080683093e-05,
+      "loss": 0.0555543327331543,
+      "step": 8100
+    },
+    {
+      "epoch": 0.29668222439306774,
+      "grad_norm": 2.2509944438934326,
+      "learning_rate": 4.258384890915012e-05,
+      "loss": 0.05133993148803711,
+      "step": 8200
+    },
+    {
+      "epoch": 0.3003003003003003,
+      "grad_norm": 1.938225507736206,
+      "learning_rate": 4.249339701146931e-05,
+      "loss": 0.05030904769897461,
+      "step": 8300
+    },
+    {
+      "epoch": 0.3039183762075328,
+      "grad_norm": 0.5656659007072449,
+      "learning_rate": 4.240294511378849e-05,
+      "loss": 0.05507714748382568,
+      "step": 8400
+    },
+    {
+      "epoch": 0.3075364521147654,
+      "grad_norm": 0.7741718888282776,
+      "learning_rate": 4.2312493216107676e-05,
+      "loss": 0.05459506511688232,
+      "step": 8500
+    },
+    {
+      "epoch": 0.3111545280219979,
+      "grad_norm": 0.547379195690155,
+      "learning_rate": 4.2222041318426867e-05,
+      "loss": 0.050563540458679196,
+      "step": 8600
+    },
+    {
+      "epoch": 0.31477260392923045,
+      "grad_norm": 0.5133877396583557,
+      "learning_rate": 4.213158942074605e-05,
+      "loss": 0.05503926753997803,
+      "step": 8700
+    },
+    {
+      "epoch": 0.31839067983646296,
+      "grad_norm": 0.4732136130332947,
+      "learning_rate": 4.204113752306524e-05,
+      "loss": 0.04883493423461914,
+      "step": 8800
+    },
+    {
+      "epoch": 0.32200875574369553,
+      "grad_norm": 0.7309387922286987,
+      "learning_rate": 4.1950685625384425e-05,
+      "loss": 0.0464065933227539,
+      "step": 8900
+    },
+    {
+      "epoch": 0.32562683165092804,
+      "grad_norm": 0.9696952104568481,
+      "learning_rate": 4.186023372770361e-05,
+      "loss": 0.05353004455566406,
+      "step": 9000
+    },
+    {
+      "epoch": 0.32924490755816055,
+      "grad_norm": 0.6350353956222534,
+      "learning_rate": 4.17697818300228e-05,
+      "loss": 0.05357151508331299,
+      "step": 9100
+    },
+    {
+      "epoch": 0.3328629834653931,
+      "grad_norm": 0.5927383899688721,
+      "learning_rate": 4.1679329932341984e-05,
+      "loss": 0.0496389102935791,
+      "step": 9200
+    },
+    {
+      "epoch": 0.3364810593726256,
+      "grad_norm": 0.555016040802002,
+      "learning_rate": 4.1588878034661175e-05,
+      "loss": 0.048683485984802245,
+      "step": 9300
+    },
+    {
+      "epoch": 0.3400991352798582,
+      "grad_norm": 0.33153098821640015,
+      "learning_rate": 4.149842613698036e-05,
+      "loss": 0.049552416801452635,
+      "step": 9400
+    },
+    {
+      "epoch": 0.3437172111870907,
+      "grad_norm": 0.7421421408653259,
+      "learning_rate": 4.140797423929954e-05,
+      "loss": 0.050444388389587404,
+      "step": 9500
+    },
+    {
+      "epoch": 0.34733528709432326,
+      "grad_norm": 0.7501067519187927,
+      "learning_rate": 4.1317522341618734e-05,
+      "loss": 0.05306045532226562,
+      "step": 9600
+    },
+    {
+      "epoch": 0.3509533630015558,
+      "grad_norm": 0.9074022173881531,
+      "learning_rate": 4.122707044393792e-05,
+      "loss": 0.04894153594970703,
+      "step": 9700
+    },
+    {
+      "epoch": 0.3545714389087883,
+      "grad_norm": 0.6082141399383545,
+      "learning_rate": 4.11366185462571e-05,
+      "loss": 0.05211612224578857,
+      "step": 9800
+    },
+    {
+      "epoch": 0.35818951481602085,
+      "grad_norm": 0.6638932824134827,
+      "learning_rate": 4.104616664857629e-05,
+      "loss": 0.05089833736419678,
+      "step": 9900
+    },
+    {
+      "epoch": 0.36180759072325336,
+      "grad_norm": 0.8939893841743469,
+      "learning_rate": 4.095571475089548e-05,
+      "loss": 0.05038036823272705,
+      "step": 10000
+    },
+    {
+      "epoch": 0.36180759072325336,
+      "eval_accuracy": 0.9821651815196725,
+      "eval_f1": 0.8226399325197526,
+      "eval_loss": 0.07629744708538055,
+      "eval_precision": 0.7916120576671035,
+      "eval_recall": 0.8561993588814253,
+      "eval_runtime": 62.5369,
+      "eval_samples_per_second": 319.811,
+      "eval_steps_per_second": 8.891,
+      "step": 10000
+    },
+    {
+      "epoch": 0.3654256666304859,
+      "grad_norm": 0.3776226043701172,
+      "learning_rate": 4.086526285321467e-05,
+      "loss": 0.05038893222808838,
+      "step": 10100
+    },
+    {
+      "epoch": 0.36904374253771843,
+      "grad_norm": 0.29007160663604736,
+      "learning_rate": 4.077481095553385e-05,
+      "loss": 0.05022284507751465,
+      "step": 10200
+    },
+    {
+      "epoch": 0.372661818444951,
+      "grad_norm": 0.2021007239818573,
+      "learning_rate": 4.0684359057853036e-05,
+      "loss": 0.049036202430725095,
+      "step": 10300
+    },
+    {
+      "epoch": 0.3762798943521835,
+      "grad_norm": 0.2728661894798279,
+      "learning_rate": 4.0593907160172226e-05,
+      "loss": 0.05147543907165528,
+      "step": 10400
+    },
+    {
+      "epoch": 0.379897970259416,
+      "grad_norm": 0.6017497181892395,
+      "learning_rate": 4.050345526249141e-05,
+      "loss": 0.052560653686523434,
+      "step": 10500
+    },
+    {
+      "epoch": 0.3835160461666486,
+      "grad_norm": 0.5500878095626831,
+      "learning_rate": 4.0413003364810594e-05,
+      "loss": 0.0445310115814209,
+      "step": 10600
+    },
+    {
+      "epoch": 0.3871341220738811,
+      "grad_norm": 1.6260461807250977,
+      "learning_rate": 4.0322551467129785e-05,
+      "loss": 0.04827467441558838,
+      "step": 10700
+    },
+    {
+      "epoch": 0.39075219798111366,
+      "grad_norm": 1.0797089338302612,
+      "learning_rate": 4.023209956944897e-05,
+      "loss": 0.0508196496963501,
+      "step": 10800
+    },
+    {
+      "epoch": 0.39437027388834617,
+      "grad_norm": 0.33457517623901367,
+      "learning_rate": 4.014164767176816e-05,
+      "loss": 0.04953153133392334,
+      "step": 10900
+    },
+    {
+      "epoch": 0.39798834979557873,
+      "grad_norm": 0.5582904815673828,
+      "learning_rate": 4.0051195774087344e-05,
+      "loss": 0.04928678035736084,
+      "step": 11000
+    },
+    {
+      "epoch": 0.40160642570281124,
+      "grad_norm": 0.21949921548366547,
+      "learning_rate": 3.996074387640653e-05,
+      "loss": 0.05192047119140625,
+      "step": 11100
+    },
+    {
+      "epoch": 0.40522450161004375,
+      "grad_norm": 0.7574787139892578,
+      "learning_rate": 3.987029197872572e-05,
+      "loss": 0.049414234161376955,
+      "step": 11200
+    },
+    {
+      "epoch": 0.4088425775172763,
+      "grad_norm": 1.8344570398330688,
+      "learning_rate": 3.97798400810449e-05,
+      "loss": 0.05043137550354004,
+      "step": 11300
+    },
+    {
+      "epoch": 0.4124606534245088,
+      "grad_norm": 0.618725061416626,
+      "learning_rate": 3.968938818336409e-05,
+      "loss": 0.04852957248687744,
+      "step": 11400
+    },
+    {
+      "epoch": 0.4160787293317414,
+      "grad_norm": 0.6515002250671387,
+      "learning_rate": 3.959893628568328e-05,
+      "loss": 0.051465816497802734,
+      "step": 11500
+    },
+    {
+      "epoch": 0.4196968052389739,
+      "grad_norm": 0.6772841215133667,
+      "learning_rate": 3.950848438800246e-05,
+      "loss": 0.05751809120178222,
+      "step": 11600
+    },
+    {
+      "epoch": 0.42331488114620647,
+      "grad_norm": 0.3189091384410858,
+      "learning_rate": 3.941803249032165e-05,
+      "loss": 0.047155842781066895,
+      "step": 11700
+    },
+    {
+      "epoch": 0.426932957053439,
+      "grad_norm": 0.2367490977048874,
+      "learning_rate": 3.9327580592640836e-05,
+      "loss": 0.043431487083435055,
+      "step": 11800
+    },
+    {
+      "epoch": 0.43055103296067154,
+      "grad_norm": 0.38205036520957947,
+      "learning_rate": 3.923712869496002e-05,
+      "loss": 0.04606367588043213,
+      "step": 11900
+    },
+    {
+      "epoch": 0.43416910886790405,
+      "grad_norm": 0.539438009262085,
+      "learning_rate": 3.914667679727921e-05,
+      "loss": 0.04509395122528076,
+      "step": 12000
+    },
+    {
+      "epoch": 0.43778718477513656,
+      "grad_norm": 1.1849830150604248,
+      "learning_rate": 3.9056224899598395e-05,
+      "loss": 0.045330324172973634,
+      "step": 12100
+    },
+    {
+      "epoch": 0.4414052606823691,
+      "grad_norm": 0.6970862746238708,
+      "learning_rate": 3.896577300191758e-05,
+      "loss": 0.04937627792358398,
+      "step": 12200
+    },
+    {
+      "epoch": 0.44502333658960164,
+      "grad_norm": 0.3145708739757538,
+      "learning_rate": 3.887532110423677e-05,
+      "loss": 0.04958348274230957,
+      "step": 12300
+    },
+    {
+      "epoch": 0.4486414124968342,
+      "grad_norm": 1.822594404220581,
+      "learning_rate": 3.8784869206555954e-05,
+      "loss": 0.05177441120147705,
+      "step": 12400
+    },
+    {
+      "epoch": 0.4522594884040667,
+      "grad_norm": 0.3980540335178375,
+      "learning_rate": 3.8694417308875145e-05,
+      "loss": 0.04803945064544678,
+      "step": 12500
+    },
+    {
+      "epoch": 0.4522594884040667,
+      "eval_accuracy": 0.9839402163062075,
+      "eval_f1": 0.8303541577576488,
+      "eval_loss": 0.07028726488351822,
+      "eval_precision": 0.8025429842491283,
+      "eval_recall": 0.8601620515794391,
+      "eval_runtime": 61.9616,
+      "eval_samples_per_second": 322.781,
+      "eval_steps_per_second": 8.973,
+      "step": 12500
+    },
+    {
+      "epoch": 0.4558775643112993,
+      "grad_norm": 2.3516685962677,
+      "learning_rate": 3.860396541119433e-05,
+      "loss": 0.04993240833282471,
+      "step": 12600
+    },
+    {
+      "epoch": 0.4594956402185318,
+      "grad_norm": 0.9219645857810974,
+      "learning_rate": 3.851351351351351e-05,
+      "loss": 0.04464954853057861,
+      "step": 12700
+    },
+    {
+      "epoch": 0.4631137161257643,
+      "grad_norm": 0.7087405920028687,
+      "learning_rate": 3.8423061615832704e-05,
+      "loss": 0.041380634307861326,
+      "step": 12800
+    },
+    {
+      "epoch": 0.46673179203299686,
+      "grad_norm": 0.3233760893344879,
+      "learning_rate": 3.833260971815189e-05,
+      "loss": 0.05234696865081787,
+      "step": 12900
+    },
+    {
+      "epoch": 0.47034986794022937,
+      "grad_norm": 0.31167057156562805,
+      "learning_rate": 3.824215782047107e-05,
+      "loss": 0.04531662464141846,
+      "step": 13000
+    },
+    {
+      "epoch": 0.47396794384746194,
+      "grad_norm": 0.9034203886985779,
+      "learning_rate": 3.815170592279026e-05,
+      "loss": 0.04655809879302979,
+      "step": 13100
+    },
+    {
+      "epoch": 0.47758601975469445,
+      "grad_norm": 0.3943072259426117,
+      "learning_rate": 3.8061254025109447e-05,
+      "loss": 0.0500339937210083,
+      "step": 13200
+    },
+    {
+      "epoch": 0.481204095661927,
+      "grad_norm": 0.9143586158752441,
+      "learning_rate": 3.797080212742864e-05,
+      "loss": 0.04793615818023682,
+      "step": 13300
+    },
+    {
+      "epoch": 0.4848221715691595,
+      "grad_norm": 1.2170947790145874,
+      "learning_rate": 3.788035022974782e-05,
+      "loss": 0.04486670970916748,
+      "step": 13400
+    },
+    {
+      "epoch": 0.48844024747639203,
+      "grad_norm": 0.4851992130279541,
+      "learning_rate": 3.7789898332067005e-05,
+      "loss": 0.0455370569229126,
+      "step": 13500
+    },
+    {
+      "epoch": 0.4920583233836246,
+      "grad_norm": 0.3209129273891449,
+      "learning_rate": 3.7699446434386196e-05,
+      "loss": 0.04612759113311768,
+      "step": 13600
+    },
+    {
+      "epoch": 0.4956763992908571,
+      "grad_norm": 0.6042996644973755,
+      "learning_rate": 3.760899453670538e-05,
+      "loss": 0.04637802600860596,
+      "step": 13700
+    },
+    {
+      "epoch": 0.49929447519808967,
+      "grad_norm": 0.422635018825531,
+      "learning_rate": 3.751854263902457e-05,
+      "loss": 0.050551199913024904,
+      "step": 13800
+    },
+    {
+      "epoch": 0.5029125511053222,
+      "grad_norm": 0.9524370431900024,
+      "learning_rate": 3.7428090741343755e-05,
+      "loss": 0.04804905891418457,
+      "step": 13900
+    },
+    {
+      "epoch": 0.5065306270125547,
+      "grad_norm": 0.8618633151054382,
+      "learning_rate": 3.733763884366294e-05,
+      "loss": 0.0453568172454834,
+      "step": 14000
+    },
+    {
+      "epoch": 0.5101487029197873,
+      "grad_norm": 0.8186506032943726,
+      "learning_rate": 3.724718694598213e-05,
+      "loss": 0.04810242176055908,
+      "step": 14100
+    },
+    {
+      "epoch": 0.5137667788270198,
+      "grad_norm": 0.4649534225463867,
+      "learning_rate": 3.7156735048301314e-05,
+      "loss": 0.041149930953979494,
+      "step": 14200
+    },
+    {
+      "epoch": 0.5173848547342523,
+      "grad_norm": 1.2224235534667969,
+      "learning_rate": 3.70662831506205e-05,
+      "loss": 0.0440573263168335,
+      "step": 14300
+    },
+    {
+      "epoch": 0.5210029306414848,
+      "grad_norm": 1.2368969917297363,
+      "learning_rate": 3.697583125293969e-05,
+      "loss": 0.045858840942382816,
+      "step": 14400
+    },
+    {
+      "epoch": 0.5246210065487174,
+      "grad_norm": 1.4308712482452393,
+      "learning_rate": 3.688537935525887e-05,
+      "loss": 0.0431610631942749,
+      "step": 14500
+    },
+    {
+      "epoch": 0.52823908245595,
+      "grad_norm": 1.7747290134429932,
+      "learning_rate": 3.6794927457578063e-05,
+      "loss": 0.04555936813354492,
+      "step": 14600
+    },
+    {
+      "epoch": 0.5318571583631825,
+      "grad_norm": 0.6626078486442566,
+      "learning_rate": 3.670447555989725e-05,
+      "loss": 0.04809264183044434,
+      "step": 14700
+    },
+    {
+      "epoch": 0.535475234270415,
+      "grad_norm": 0.49305254220962524,
+      "learning_rate": 3.661402366221643e-05,
+      "loss": 0.044796910285949704,
+      "step": 14800
+    },
+    {
+      "epoch": 0.5390933101776475,
+      "grad_norm": 0.5383502840995789,
+      "learning_rate": 3.652357176453562e-05,
+      "loss": 0.04197264194488525,
+      "step": 14900
+    },
+    {
+      "epoch": 0.54271138608488,
+      "grad_norm": 0.9339898824691772,
+      "learning_rate": 3.6433119866854806e-05,
+      "loss": 0.04077723026275635,
+      "step": 15000
+    },
+    {
+      "epoch": 0.54271138608488,
+      "eval_accuracy": 0.9837071542003397,
+      "eval_f1": 0.8344733667950663,
+      "eval_loss": 0.0750078409910202,
+      "eval_precision": 0.8071688796555565,
+      "eval_recall": 0.8636898145910855,
+      "eval_runtime": 62.6857,
+      "eval_samples_per_second": 319.052,
+      "eval_steps_per_second": 8.87,
+      "step": 15000
+    },
+    {
+      "epoch": 0.5463294619921126,
+      "grad_norm": 0.7692775130271912,
+      "learning_rate": 3.634266796917399e-05,
+      "loss": 0.04739581108093262,
+      "step": 15100
+    },
+    {
+      "epoch": 0.5499475378993451,
+      "grad_norm": 1.047753095626831,
+      "learning_rate": 3.625221607149318e-05,
+      "loss": 0.04375821590423584,
+      "step": 15200
+    },
+    {
+      "epoch": 0.5535656138065776,
+      "grad_norm": 0.9720122218132019,
+      "learning_rate": 3.6161764173812365e-05,
+      "loss": 0.0421258020401001,
+      "step": 15300
+    },
+    {
+      "epoch": 0.5571836897138102,
+      "grad_norm": 0.3475571274757385,
+      "learning_rate": 3.6071312276131556e-05,
+      "loss": 0.04756541728973389,
+      "step": 15400
+    },
+    {
+      "epoch": 0.5608017656210428,
+      "grad_norm": 0.8692478537559509,
+      "learning_rate": 3.598086037845074e-05,
+      "loss": 0.04661733150482178,
+      "step": 15500
+    },
+    {
+      "epoch": 0.5644198415282753,
+      "grad_norm": 1.0307046175003052,
+      "learning_rate": 3.5890408480769924e-05,
+      "loss": 0.044859604835510256,
+      "step": 15600
+    },
+    {
+      "epoch": 0.5680379174355078,
+      "grad_norm": 0.654683530330658,
+      "learning_rate": 3.5799956583089115e-05,
+      "loss": 0.04575653076171875,
+      "step": 15700
+    },
+    {
+      "epoch": 0.5716559933427403,
+      "grad_norm": 2.222489356994629,
+      "learning_rate": 3.57095046854083e-05,
+      "loss": 0.04321366310119629,
+      "step": 15800
+    },
+    {
+      "epoch": 0.5752740692499728,
+      "grad_norm": 1.1416321992874146,
+      "learning_rate": 3.561905278772748e-05,
+      "loss": 0.043632102012634275,
+      "step": 15900
+    },
+    {
+      "epoch": 0.5788921451572054,
+      "grad_norm": 1.0366028547286987,
+      "learning_rate": 3.5528600890046673e-05,
+      "loss": 0.04524300575256348,
+      "step": 16000
+    },
+    {
+      "epoch": 0.582510221064438,
+      "grad_norm": 0.7538347840309143,
+      "learning_rate": 3.543814899236586e-05,
+      "loss": 0.04251582622528076,
+      "step": 16100
+    },
+    {
+      "epoch": 0.5861282969716705,
+      "grad_norm": 0.2561816871166229,
+      "learning_rate": 3.534769709468505e-05,
+      "loss": 0.04683804512023926,
+      "step": 16200
+    },
+    {
+      "epoch": 0.589746372878903,
+      "grad_norm": 0.9383835196495056,
+      "learning_rate": 3.525724519700423e-05,
+      "loss": 0.0412297248840332,
+      "step": 16300
+    },
+    {
+      "epoch": 0.5933644487861355,
+      "grad_norm": 0.5518015623092651,
+      "learning_rate": 3.5166793299323416e-05,
+      "loss": 0.0455796480178833,
+      "step": 16400
+    },
+    {
+      "epoch": 0.5969825246933681,
+      "grad_norm": 0.5094241499900818,
+      "learning_rate": 3.507634140164261e-05,
+      "loss": 0.04736936569213867,
+      "step": 16500
+    },
+    {
+      "epoch": 0.6006006006006006,
+      "grad_norm": 0.2816466987133026,
+      "learning_rate": 3.498588950396179e-05,
+      "loss": 0.042105512619018556,
+      "step": 16600
+    },
+    {
+      "epoch": 0.6042186765078331,
+      "grad_norm": 0.4187323749065399,
+      "learning_rate": 3.489543760628098e-05,
+      "loss": 0.044366950988769534,
+      "step": 16700
+    },
+    {
+      "epoch": 0.6078367524150656,
+      "grad_norm": 0.28667891025543213,
+      "learning_rate": 3.4804985708600166e-05,
+      "loss": 0.03723037719726562,
+      "step": 16800
+    },
+    {
+      "epoch": 0.6114548283222982,
+      "grad_norm": 0.3902330994606018,
+      "learning_rate": 3.471453381091935e-05,
+      "loss": 0.042644596099853514,
+      "step": 16900
+    },
+    {
+      "epoch": 0.6150729042295308,
+      "grad_norm": 0.465101033449173,
+      "learning_rate": 3.462408191323854e-05,
+      "loss": 0.04263707160949707,
+      "step": 17000
+    },
+    {
+      "epoch": 0.6186909801367633,
+      "grad_norm": 1.1710171699523926,
+      "learning_rate": 3.4533630015557725e-05,
+      "loss": 0.044122686386108396,
+      "step": 17100
+    },
+    {
+      "epoch": 0.6223090560439958,
+      "grad_norm": 0.4717200696468353,
+      "learning_rate": 3.444317811787691e-05,
+      "loss": 0.042054853439331054,
+      "step": 17200
+    },
+    {
+      "epoch": 0.6259271319512283,
+      "grad_norm": 0.18602319061756134,
+      "learning_rate": 3.43527262201961e-05,
+      "loss": 0.03980276823043823,
+      "step": 17300
+    },
+    {
+      "epoch": 0.6295452078584609,
+      "grad_norm": 2.258084535598755,
+      "learning_rate": 3.4262274322515284e-05,
+      "loss": 0.043924779891967775,
+      "step": 17400
+    },
+    {
+      "epoch": 0.6331632837656934,
+      "grad_norm": 0.5568512082099915,
+      "learning_rate": 3.4171822424834474e-05,
+      "loss": 0.04432165145874024,
+      "step": 17500
+    },
+    {
+      "epoch": 0.6331632837656934,
+      "eval_accuracy": 0.9848981898715126,
+      "eval_f1": 0.8395063656955402,
+      "eval_loss": 0.06519697606563568,
+      "eval_precision": 0.8148625494685449,
+      "eval_recall": 0.8656872694469949,
+      "eval_runtime": 61.9341,
+      "eval_samples_per_second": 322.924,
+      "eval_steps_per_second": 8.977,
+      "step": 17500
+    },
+    {
+      "epoch": 0.6367813596729259,
+      "grad_norm": 0.302276611328125,
+      "learning_rate": 3.408137052715366e-05,
+      "loss": 0.04175849914550781,
+      "step": 17600
+    },
+    {
+      "epoch": 0.6403994355801584,
+      "grad_norm": 0.20687709748744965,
+      "learning_rate": 3.399091862947284e-05,
+      "loss": 0.042713408470153806,
+      "step": 17700
+    },
+    {
+      "epoch": 0.6440175114873911,
+      "grad_norm": 0.5285593271255493,
+      "learning_rate": 3.390046673179203e-05,
+      "loss": 0.041079201698303223,
+      "step": 17800
+    },
+    {
+      "epoch": 0.6476355873946236,
+      "grad_norm": 0.359951913356781,
+      "learning_rate": 3.381001483411122e-05,
+      "loss": 0.047190561294555664,
+      "step": 17900
+    },
+    {
+      "epoch": 0.6512536633018561,
+      "grad_norm": 0.5516379475593567,
+      "learning_rate": 3.371956293643041e-05,
+      "loss": 0.049062256813049314,
+      "step": 18000
+    },
+    {
+      "epoch": 0.6548717392090886,
+      "grad_norm": 0.2408919632434845,
+      "learning_rate": 3.362911103874959e-05,
+      "loss": 0.041800622940063474,
+      "step": 18100
+    },
+    {
+      "epoch": 0.6584898151163211,
+      "grad_norm": 0.5572479963302612,
+      "learning_rate": 3.3538659141068776e-05,
+      "loss": 0.04303212165832519,
+      "step": 18200
+    },
+    {
+      "epoch": 0.6621078910235537,
+      "grad_norm": 1.1610311269760132,
+      "learning_rate": 3.344820724338797e-05,
+      "loss": 0.04213200092315674,
+      "step": 18300
+    },
+    {
+      "epoch": 0.6657259669307862,
+      "grad_norm": 0.945891797542572,
+      "learning_rate": 3.335775534570715e-05,
+      "loss": 0.0419348955154419,
+      "step": 18400
+    },
+    {
+      "epoch": 0.6693440428380187,
+      "grad_norm": 0.40828007459640503,
+      "learning_rate": 3.326730344802634e-05,
+      "loss": 0.039156782627105716,
+      "step": 18500
+    },
+    {
+      "epoch": 0.6729621187452512,
+      "grad_norm": 2.0386905670166016,
+      "learning_rate": 3.3176851550345526e-05,
+      "loss": 0.042091598510742186,
+      "step": 18600
+    },
+    {
+      "epoch": 0.6765801946524838,
+      "grad_norm": 2.043750762939453,
+      "learning_rate": 3.308639965266471e-05,
+      "loss": 0.04341127872467041,
+      "step": 18700
+    },
+    {
+      "epoch": 0.6801982705597164,
+      "grad_norm": 1.103946328163147,
+      "learning_rate": 3.29959477549839e-05,
+      "loss": 0.04109795570373535,
+      "step": 18800
+    },
+    {
+      "epoch": 0.6838163464669489,
+      "grad_norm": 1.6356172561645508,
+      "learning_rate": 3.2905495857303084e-05,
+      "loss": 0.04152417182922363,
+      "step": 18900
+    },
+    {
+      "epoch": 0.6874344223741814,
+      "grad_norm": 0.5166067481040955,
+      "learning_rate": 3.2815043959622275e-05,
+      "loss": 0.03941408634185791,
+      "step": 19000
+    },
+    {
+      "epoch": 0.6910524982814139,
+      "grad_norm": 0.341791570186615,
+      "learning_rate": 3.272459206194146e-05,
+      "loss": 0.04008223056793213,
+      "step": 19100
+    },
+    {
+      "epoch": 0.6946705741886465,
+      "grad_norm": 0.2977801263332367,
+      "learning_rate": 3.263414016426064e-05,
+      "loss": 0.046716113090515134,
+      "step": 19200
+    },
+    {
+      "epoch": 0.698288650095879,
+      "grad_norm": 1.640602707862854,
+      "learning_rate": 3.2543688266579834e-05,
+      "loss": 0.043398504257202146,
+      "step": 19300
+    },
+    {
+      "epoch": 0.7019067260031115,
+      "grad_norm": 0.3690544366836548,
+      "learning_rate": 3.245323636889902e-05,
+      "loss": 0.03948961734771728,
+      "step": 19400
+    },
+    {
+      "epoch": 0.7055248019103441,
+      "grad_norm": 2.460749387741089,
+      "learning_rate": 3.236278447121821e-05,
+      "loss": 0.04185768127441406,
+      "step": 19500
+    },
+    {
+      "epoch": 0.7091428778175766,
+      "grad_norm": 0.5380750894546509,
+      "learning_rate": 3.227233257353739e-05,
+      "loss": 0.040400395393371584,
+      "step": 19600
+    },
+    {
+      "epoch": 0.7127609537248092,
+      "grad_norm": 0.44135797023773193,
+      "learning_rate": 3.218188067585658e-05,
+      "loss": 0.04154191017150879,
+      "step": 19700
+    },
+    {
+      "epoch": 0.7163790296320417,
+      "grad_norm": 0.5789956450462341,
+      "learning_rate": 3.209142877817577e-05,
+      "loss": 0.0443493127822876,
+      "step": 19800
+    },
+    {
+      "epoch": 0.7199971055392742,
+      "grad_norm": 0.32769912481307983,
+      "learning_rate": 3.200097688049495e-05,
+      "loss": 0.03976017475128174,
+      "step": 19900
+    },
+    {
+      "epoch": 0.7236151814465067,
+      "grad_norm": 0.6033921837806702,
+      "learning_rate": 3.1910524982814136e-05,
+      "loss": 0.04033390522003174,
+      "step": 20000
+    },
+    {
+      "epoch": 0.7236151814465067,
+      "eval_accuracy": 0.9859394821797719,
+      "eval_f1": 0.8507431047883741,
+      "eval_loss": 0.064690500497818,
+      "eval_precision": 0.8298106965631318,
+      "eval_recall": 0.8727589039771904,
+      "eval_runtime": 62.6781,
+      "eval_samples_per_second": 319.091,
+      "eval_steps_per_second": 8.871,
+      "step": 20000
+    },
+    {
+      "epoch": 0.7272332573537392,
+      "grad_norm": 0.21106982231140137,
+      "learning_rate": 3.1820073085133327e-05,
+      "loss": 0.0368848705291748,
+      "step": 20100
+    },
+    {
+      "epoch": 0.7308513332609718,
+      "grad_norm": 0.8279436826705933,
+      "learning_rate": 3.172962118745251e-05,
+      "loss": 0.040103306770324705,
+      "step": 20200
+    },
+    {
+      "epoch": 0.7344694091682044,
+      "grad_norm": 0.21994882822036743,
+      "learning_rate": 3.16391692897717e-05,
+      "loss": 0.037559795379638675,
+      "step": 20300
+    },
+    {
+      "epoch": 0.7380874850754369,
+      "grad_norm": 1.8766059875488281,
+      "learning_rate": 3.1548717392090885e-05,
+      "loss": 0.04059103012084961,
+      "step": 20400
+    },
+    {
+      "epoch": 0.7417055609826694,
+      "grad_norm": 0.6307962536811829,
+      "learning_rate": 3.145826549441007e-05,
+      "loss": 0.03980612993240357,
+      "step": 20500
+    },
+    {
+      "epoch": 0.745323636889902,
+      "grad_norm": 0.33936986327171326,
+      "learning_rate": 3.136781359672926e-05,
+      "loss": 0.043472270965576175,
+      "step": 20600
+    },
+    {
+      "epoch": 0.7489417127971345,
+      "grad_norm": 0.7730916738510132,
+      "learning_rate": 3.1277361699048444e-05,
+      "loss": 0.040565075874328616,
+      "step": 20700
+    },
+    {
+      "epoch": 0.752559788704367,
+      "grad_norm": 0.3246110677719116,
+      "learning_rate": 3.1186909801367635e-05,
+      "loss": 0.04017134189605713,
+      "step": 20800
+    },
+    {
+      "epoch": 0.7561778646115995,
+      "grad_norm": 0.8956949710845947,
+      "learning_rate": 3.109645790368682e-05,
+      "loss": 0.04045989513397217,
+      "step": 20900
+    },
+    {
+      "epoch": 0.759795940518832,
+      "grad_norm": 2.5085365772247314,
+      "learning_rate": 3.1006006006006e-05,
+      "loss": 0.0404241943359375,
+      "step": 21000
+    },
+    {
+      "epoch": 0.7634140164260647,
+      "grad_norm": 0.1668255627155304,
+      "learning_rate": 3.0915554108325194e-05,
+      "loss": 0.039553046226501465,
+      "step": 21100
+    },
+    {
+      "epoch": 0.7670320923332972,
+      "grad_norm": 0.39517688751220703,
+      "learning_rate": 3.082510221064438e-05,
+      "loss": 0.04120331764221191,
+      "step": 21200
+    },
+    {
+      "epoch": 0.7706501682405297,
+      "grad_norm": 0.6607240438461304,
+      "learning_rate": 3.073465031296357e-05,
+      "loss": 0.03997873306274414,
+      "step": 21300
+    },
+    {
+      "epoch": 0.7742682441477622,
+      "grad_norm": 0.44018736481666565,
+      "learning_rate": 3.064419841528275e-05,
+      "loss": 0.041695055961608884,
+      "step": 21400
+    },
+    {
+      "epoch": 0.7778863200549948,
+      "grad_norm": 0.15856041014194489,
+      "learning_rate": 3.055374651760194e-05,
+      "loss": 0.04077398300170899,
+      "step": 21500
+    },
+    {
+      "epoch": 0.7815043959622273,
+      "grad_norm": 0.39261528849601746,
+      "learning_rate": 3.0463294619921127e-05,
+      "loss": 0.041572155952453616,
+      "step": 21600
+    },
+    {
+      "epoch": 0.7851224718694598,
+      "grad_norm": 0.28265002369880676,
+      "learning_rate": 3.0372842722240315e-05,
+      "loss": 0.045727620124816896,
+      "step": 21700
+    },
+    {
+      "epoch": 0.7887405477766923,
+      "grad_norm": 0.6709412336349487,
+      "learning_rate": 3.0282390824559502e-05,
+      "loss": 0.04259458065032959,
+      "step": 21800
+    },
+    {
+      "epoch": 0.7923586236839248,
+      "grad_norm": 0.24202914535999298,
+      "learning_rate": 3.0191938926878686e-05,
+      "loss": 0.03839920997619629,
+      "step": 21900
+    },
+    {
+      "epoch": 0.7959766995911575,
+      "grad_norm": 0.4965508282184601,
+      "learning_rate": 3.0101487029197874e-05,
+      "loss": 0.03700316905975342,
+      "step": 22000
+    },
+    {
+      "epoch": 0.79959477549839,
+      "grad_norm": 0.596442461013794,
+      "learning_rate": 3.001103513151706e-05,
+      "loss": 0.04116812229156494,
+      "step": 22100
+    },
+    {
+      "epoch": 0.8032128514056225,
+      "grad_norm": 0.5273512601852417,
+      "learning_rate": 2.992058323383625e-05,
+      "loss": 0.04079509735107422,
+      "step": 22200
+    },
+    {
+      "epoch": 0.806830927312855,
+      "grad_norm": 0.24124516546726227,
+      "learning_rate": 2.9830131336155432e-05,
+      "loss": 0.03795903921127319,
+      "step": 22300
+    },
+    {
+      "epoch": 0.8104490032200875,
+      "grad_norm": 0.46343305706977844,
+      "learning_rate": 2.973967943847462e-05,
+      "loss": 0.038403522968292234,
+      "step": 22400
+    },
+    {
+      "epoch": 0.8140670791273201,
+      "grad_norm": 0.2311462014913559,
+      "learning_rate": 2.9649227540793807e-05,
+      "loss": 0.04132327079772949,
+      "step": 22500
+    },
+    {
+      "epoch": 0.8140670791273201,
+      "eval_accuracy": 0.9865150342336365,
+      "eval_f1": 0.8464219002621376,
+      "eval_loss": 0.05898759886622429,
+      "eval_precision": 0.8253309864544272,
+      "eval_recall": 0.8686190177032491,
+      "eval_runtime": 62.4843,
+      "eval_samples_per_second": 320.08,
+      "eval_steps_per_second": 8.898,
+      "step": 22500
+    },
+    {
+      "epoch": 0.8176851550345526,
+      "grad_norm": 0.6530361175537109,
+      "learning_rate": 2.9558775643112995e-05,
+      "loss": 0.04163932323455811,
+      "step": 22600
+    },
+    {
+      "epoch": 0.8213032309417851,
+      "grad_norm": 1.38533353805542,
+      "learning_rate": 2.946832374543218e-05,
+      "loss": 0.03626733779907226,
+      "step": 22700
+    },
+    {
+      "epoch": 0.8249213068490177,
+      "grad_norm": 1.6181460618972778,
+      "learning_rate": 2.9377871847751366e-05,
+      "loss": 0.03692409038543701,
+      "step": 22800
+    },
+    {
+      "epoch": 0.8285393827562503,
+      "grad_norm": 6.322599411010742,
+      "learning_rate": 2.9287419950070554e-05,
+      "loss": 0.03785946369171143,
+      "step": 22900
+    },
+    {
+      "epoch": 0.8321574586634828,
+      "grad_norm": 0.24266965687274933,
+      "learning_rate": 2.919696805238974e-05,
+      "loss": 0.03527719974517822,
+      "step": 23000
+    },
+    {
+      "epoch": 0.8357755345707153,
+      "grad_norm": 0.41426071524620056,
+      "learning_rate": 2.910651615470893e-05,
+      "loss": 0.0348510479927063,
+      "step": 23100
+    },
+    {
+      "epoch": 0.8393936104779478,
+      "grad_norm": 0.3566010892391205,
+      "learning_rate": 2.9016064257028112e-05,
+      "loss": 0.03639560461044312,
+      "step": 23200
+    },
+    {
+      "epoch": 0.8430116863851803,
+      "grad_norm": 0.14937593042850494,
+      "learning_rate": 2.89256123593473e-05,
+      "loss": 0.033641955852508544,
+      "step": 23300
+    },
+    {
+      "epoch": 0.8466297622924129,
+      "grad_norm": 0.5473237037658691,
+      "learning_rate": 2.8835160461666487e-05,
+      "loss": 0.03712946176528931,
+      "step": 23400
+    },
+    {
+      "epoch": 0.8502478381996454,
+      "grad_norm": 0.3679254949092865,
+      "learning_rate": 2.874470856398567e-05,
+      "loss": 0.03785475969314575,
+      "step": 23500
+    },
+    {
+      "epoch": 0.853865914106878,
+      "grad_norm": 0.20851418375968933,
+      "learning_rate": 2.8654256666304862e-05,
+      "loss": 0.04206960201263428,
+      "step": 23600
+    },
+    {
+      "epoch": 0.8574839900141105,
+      "grad_norm": 0.22139862179756165,
+      "learning_rate": 2.8563804768624046e-05,
+      "loss": 0.03989522218704224,
+      "step": 23700
+    },
+    {
+      "epoch": 0.8611020659213431,
+      "grad_norm": 0.14680643379688263,
+      "learning_rate": 2.8473352870943233e-05,
+      "loss": 0.03717276811599732,
+      "step": 23800
+    },
+    {
+      "epoch": 0.8647201418285756,
+      "grad_norm": 0.2279856950044632,
+      "learning_rate": 2.838290097326242e-05,
+      "loss": 0.039047441482543944,
+      "step": 23900
+    },
+    {
+      "epoch": 0.8683382177358081,
+      "grad_norm": 1.1088160276412964,
+      "learning_rate": 2.8292449075581605e-05,
+      "loss": 0.03408738613128662,
+      "step": 24000
+    },
+    {
+      "epoch": 0.8719562936430406,
+      "grad_norm": 0.8532550930976868,
+      "learning_rate": 2.8201997177900796e-05,
+      "loss": 0.036566758155822755,
+      "step": 24100
+    },
+    {
+      "epoch": 0.8755743695502731,
+      "grad_norm": 0.1683458536863327,
+      "learning_rate": 2.811154528021998e-05,
+      "loss": 0.0397763442993164,
+      "step": 24200
+    },
+    {
+      "epoch": 0.8791924454575057,
+      "grad_norm": 0.3468044102191925,
+      "learning_rate": 2.8021093382539164e-05,
+      "loss": 0.036167433261871336,
+      "step": 24300
+    },
+    {
+      "epoch": 0.8828105213647383,
+      "grad_norm": 1.5043731927871704,
+      "learning_rate": 2.7930641484858354e-05,
+      "loss": 0.04083109855651856,
+      "step": 24400
+    },
+    {
+      "epoch": 0.8864285972719708,
+      "grad_norm": 2.7504560947418213,
+      "learning_rate": 2.784018958717754e-05,
+      "loss": 0.039477238655090334,
+      "step": 24500
+    },
+    {
+      "epoch": 0.8900466731792033,
+      "grad_norm": 0.27413201332092285,
+      "learning_rate": 2.7749737689496726e-05,
+      "loss": 0.03859598875045776,
+      "step": 24600
+    },
+    {
+      "epoch": 0.8936647490864358,
+      "grad_norm": 0.4622710645198822,
+      "learning_rate": 2.7659285791815913e-05,
+      "loss": 0.03455983877182007,
+      "step": 24700
+    },
+    {
+      "epoch": 0.8972828249936684,
+      "grad_norm": 1.0147453546524048,
+      "learning_rate": 2.7568833894135097e-05,
+      "loss": 0.03525468587875366,
+      "step": 24800
+    },
+    {
+      "epoch": 0.9009009009009009,
+      "grad_norm": 0.34606319665908813,
+      "learning_rate": 2.7478381996454288e-05,
+      "loss": 0.03580186367034912,
+      "step": 24900
+    },
+    {
+      "epoch": 0.9045189768081334,
+      "grad_norm": 0.3202800750732422,
+      "learning_rate": 2.7387930098773472e-05,
+      "loss": 0.03665663719177246,
+      "step": 25000
+    },
+    {
+      "epoch": 0.9045189768081334,
+      "eval_accuracy": 0.986656714492393,
+      "eval_f1": 0.8509657594381035,
+      "eval_loss": 0.05820872634649277,
+      "eval_precision": 0.8288109453496006,
+      "eval_recall": 0.8743375376536349,
+      "eval_runtime": 62.5862,
+      "eval_samples_per_second": 319.559,
+      "eval_steps_per_second": 8.884,
+      "step": 25000
+    },
+    {
+      "epoch": 0.9081370527153659,
+      "grad_norm": 0.557600736618042,
+      "learning_rate": 2.7297478201092656e-05,
+      "loss": 0.03967963457107544,
+      "step": 25100
+    },
+    {
+      "epoch": 0.9117551286225986,
+      "grad_norm": 0.4092039465904236,
+      "learning_rate": 2.7207026303411847e-05,
+      "loss": 0.03797311782836914,
+      "step": 25200
+    },
+    {
+      "epoch": 0.9153732045298311,
+      "grad_norm": 0.40534520149230957,
+      "learning_rate": 2.711657440573103e-05,
+      "loss": 0.036147847175598144,
+      "step": 25300
+    },
+    {
+      "epoch": 0.9189912804370636,
+      "grad_norm": 0.4325968623161316,
+      "learning_rate": 2.702612250805022e-05,
+      "loss": 0.03767855882644653,
+      "step": 25400
+    },
+    {
+      "epoch": 0.9226093563442961,
+      "grad_norm": 0.25961676239967346,
+      "learning_rate": 2.6935670610369406e-05,
+      "loss": 0.03738126039505005,
+      "step": 25500
+    },
+    {
+      "epoch": 0.9262274322515286,
+      "grad_norm": 0.2495643049478531,
+      "learning_rate": 2.684521871268859e-05,
+      "loss": 0.03809333562850952,
+      "step": 25600
+    },
+    {
+      "epoch": 0.9298455081587612,
+      "grad_norm": 0.20810630917549133,
+      "learning_rate": 2.675476681500778e-05,
+      "loss": 0.03803467035293579,
+      "step": 25700
+    },
+    {
+      "epoch": 0.9334635840659937,
+      "grad_norm": 0.3630845844745636,
+      "learning_rate": 2.6664314917326964e-05,
+      "loss": 0.04232705593109131,
+      "step": 25800
+    },
+    {
+      "epoch": 0.9370816599732262,
+      "grad_norm": 0.6230679154396057,
+      "learning_rate": 2.6573863019646155e-05,
+      "loss": 0.03966914892196655,
+      "step": 25900
+    },
+    {
+      "epoch": 0.9406997358804587,
+      "grad_norm": 0.6846088767051697,
+      "learning_rate": 2.648341112196534e-05,
+      "loss": 0.03988933086395264,
+      "step": 26000
+    },
+    {
+      "epoch": 0.9443178117876913,
+      "grad_norm": 0.29151585698127747,
+      "learning_rate": 2.6392959224284523e-05,
+      "loss": 0.036113507747650146,
+      "step": 26100
+    },
+    {
+      "epoch": 0.9479358876949239,
+      "grad_norm": 0.3652597963809967,
+      "learning_rate": 2.6302507326603714e-05,
+      "loss": 0.03595402717590332,
+      "step": 26200
+    },
+    {
+      "epoch": 0.9515539636021564,
+      "grad_norm": 0.3763394355773926,
+      "learning_rate": 2.6212055428922898e-05,
+      "loss": 0.03632761478424072,
+      "step": 26300
+    },
+    {
+      "epoch": 0.9551720395093889,
+      "grad_norm": 0.16137683391571045,
+      "learning_rate": 2.612160353124209e-05,
+      "loss": 0.03010902166366577,
+      "step": 26400
+    },
+    {
+      "epoch": 0.9587901154166214,
+      "grad_norm": 0.5310078859329224,
+      "learning_rate": 2.6031151633561273e-05,
+      "loss": 0.034855997562408446,
+      "step": 26500
+    },
+    {
+      "epoch": 0.962408191323854,
+      "grad_norm": 0.4904273748397827,
+      "learning_rate": 2.5940699735880457e-05,
+      "loss": 0.03756725311279297,
+      "step": 26600
+    },
+    {
+      "epoch": 0.9660262672310865,
+      "grad_norm": 0.7692480087280273,
+      "learning_rate": 2.5850247838199648e-05,
+      "loss": 0.03645958185195923,
+      "step": 26700
+    },
+    {
+      "epoch": 0.969644343138319,
+      "grad_norm": 0.45624640583992004,
+      "learning_rate": 2.5759795940518832e-05,
+      "loss": 0.037951292991638186,
+      "step": 26800
+    },
+    {
+      "epoch": 0.9732624190455516,
+      "grad_norm": 0.41989752650260925,
+      "learning_rate": 2.5669344042838023e-05,
+      "loss": 0.03396618366241455,
+      "step": 26900
+    },
+    {
+      "epoch": 0.9768804949527841,
+      "grad_norm": 0.5218580961227417,
+      "learning_rate": 2.5578892145157207e-05,
+      "loss": 0.034535303115844726,
+      "step": 27000
+    },
+    {
+      "epoch": 0.9804985708600167,
+      "grad_norm": 0.24635274708271027,
+      "learning_rate": 2.548844024747639e-05,
+      "loss": 0.034599866867065426,
+      "step": 27100
+    },
+    {
+      "epoch": 0.9841166467672492,
+      "grad_norm": 0.8805984258651733,
+      "learning_rate": 2.539798834979558e-05,
+      "loss": 0.0382379937171936,
+      "step": 27200
+    },
+    {
+      "epoch": 0.9877347226744817,
+      "grad_norm": 0.4743868410587311,
+      "learning_rate": 2.5307536452114765e-05,
+      "loss": 0.03450409173965454,
+      "step": 27300
+    },
+    {
+      "epoch": 0.9913527985817142,
+      "grad_norm": 0.4024532735347748,
+      "learning_rate": 2.521708455443395e-05,
+      "loss": 0.032371597290039064,
+      "step": 27400
+    },
+    {
+      "epoch": 0.9949708744889468,
+      "grad_norm": 1.2098551988601685,
+      "learning_rate": 2.512663265675314e-05,
+      "loss": 0.03947657585144043,
+      "step": 27500
+    },
+    {
+      "epoch": 0.9949708744889468,
+      "eval_accuracy": 0.9862055646169487,
+      "eval_f1": 0.8529879572824359,
+      "eval_loss": 0.05825402960181236,
+      "eval_precision": 0.8304042715484363,
+      "eval_recall": 0.8768343562235217,
+      "eval_runtime": 62.2283,
+      "eval_samples_per_second": 321.397,
+      "eval_steps_per_second": 8.935,
+      "step": 27500
+    },
+    {
+      "epoch": 0.9985889503961793,
+      "grad_norm": 0.3243059515953064,
+      "learning_rate": 2.5036180759072324e-05,
+      "loss": 0.03721761703491211,
+      "step": 27600
+    },
+    {
+      "epoch": 1.0022070263034117,
+      "grad_norm": 0.5898327231407166,
+      "learning_rate": 2.494572886139151e-05,
+      "loss": 0.03310096025466919,
+      "step": 27700
+    },
+    {
+      "epoch": 1.0058251022106444,
+      "grad_norm": 0.30443838238716125,
+      "learning_rate": 2.48552769637107e-05,
+      "loss": 0.033098301887512206,
+      "step": 27800
+    },
+    {
+      "epoch": 1.009443178117877,
+      "grad_norm": 0.7985163331031799,
+      "learning_rate": 2.4764825066029886e-05,
+      "loss": 0.031821844577789304,
+      "step": 27900
+    },
+    {
+      "epoch": 1.0130612540251094,
+      "grad_norm": 0.6274137496948242,
+      "learning_rate": 2.4674373168349074e-05,
+      "loss": 0.03217078447341919,
+      "step": 28000
+    },
+    {
+      "epoch": 1.016679329932342,
+      "grad_norm": 0.744652271270752,
+      "learning_rate": 2.4583921270668258e-05,
+      "loss": 0.030337939262390135,
+      "step": 28100
+    },
+    {
+      "epoch": 1.0202974058395746,
+      "grad_norm": 0.20680102705955505,
+      "learning_rate": 2.4493469372987445e-05,
+      "loss": 0.03135863780975342,
+      "step": 28200
+    },
+    {
+      "epoch": 1.023915481746807,
+      "grad_norm": 0.5819505453109741,
+      "learning_rate": 2.4403017475306633e-05,
+      "loss": 0.030997350215911865,
+      "step": 28300
+    },
+    {
+      "epoch": 1.0275335576540396,
+      "grad_norm": 0.8105890154838562,
+      "learning_rate": 2.431256557762582e-05,
+      "loss": 0.029717042446136474,
+      "step": 28400
+    },
+    {
+      "epoch": 1.031151633561272,
+      "grad_norm": 0.4248642325401306,
+      "learning_rate": 2.4222113679945007e-05,
+      "loss": 0.02956360101699829,
+      "step": 28500
+    },
+    {
+      "epoch": 1.0347697094685047,
+      "grad_norm": 0.17442703247070312,
+      "learning_rate": 2.413166178226419e-05,
+      "loss": 0.03415003776550293,
+      "step": 28600
+    },
+    {
+      "epoch": 1.0383877853757373,
+      "grad_norm": 0.3765491843223572,
+      "learning_rate": 2.404120988458338e-05,
+      "loss": 0.03359386682510376,
+      "step": 28700
+    },
+    {
+      "epoch": 1.0420058612829697,
+      "grad_norm": 0.2846165895462036,
+      "learning_rate": 2.3950757986902566e-05,
+      "loss": 0.03219552993774414,
+      "step": 28800
+    },
+    {
+      "epoch": 1.0456239371902023,
+      "grad_norm": 0.6828330755233765,
+      "learning_rate": 2.3860306089221754e-05,
+      "loss": 0.028468940258026123,
+      "step": 28900
+    },
+    {
+      "epoch": 1.0492420130974347,
+      "grad_norm": 0.24457824230194092,
+      "learning_rate": 2.3769854191540938e-05,
+      "loss": 0.03526209592819214,
+      "step": 29000
+    },
+    {
+      "epoch": 1.0528600890046673,
+      "grad_norm": 0.4728795886039734,
+      "learning_rate": 2.3679402293860125e-05,
+      "loss": 0.027564334869384765,
+      "step": 29100
+    },
+    {
+      "epoch": 1.0564781649119,
+      "grad_norm": 0.34912073612213135,
+      "learning_rate": 2.3588950396179312e-05,
+      "loss": 0.03199338912963867,
+      "step": 29200
+    },
+    {
+      "epoch": 1.0600962408191323,
+      "grad_norm": 0.7076539993286133,
+      "learning_rate": 2.34984984984985e-05,
+      "loss": 0.02838871717453003,
+      "step": 29300
+    },
+    {
+      "epoch": 1.063714316726365,
+      "grad_norm": 0.22086426615715027,
+      "learning_rate": 2.3408046600817687e-05,
+      "loss": 0.03132739543914795,
+      "step": 29400
+    },
+    {
+      "epoch": 1.0673323926335974,
+      "grad_norm": 0.4026763439178467,
+      "learning_rate": 2.331759470313687e-05,
+      "loss": 0.030288333892822265,
+      "step": 29500
+    },
+    {
+      "epoch": 1.07095046854083,
+      "grad_norm": 0.6986600160598755,
+      "learning_rate": 2.322714280545606e-05,
+      "loss": 0.027701468467712403,
+      "step": 29600
+    },
+    {
+      "epoch": 1.0745685444480626,
+      "grad_norm": 0.3440704047679901,
+      "learning_rate": 2.3136690907775246e-05,
+      "loss": 0.03199631690979004,
+      "step": 29700
+    },
+    {
+      "epoch": 1.078186620355295,
+      "grad_norm": 0.5154510736465454,
+      "learning_rate": 2.3046239010094434e-05,
+      "loss": 0.03085195779800415,
+      "step": 29800
+    },
+    {
+      "epoch": 1.0818046962625276,
+      "grad_norm": 1.2285401821136475,
+      "learning_rate": 2.295578711241362e-05,
+      "loss": 0.031190474033355713,
+      "step": 29900
+    },
+    {
+      "epoch": 1.08542277216976,
+      "grad_norm": 0.3479061722755432,
+      "learning_rate": 2.2865335214732805e-05,
+      "loss": 0.03375990152359009,
+      "step": 30000
+    },
+    {
+      "epoch": 1.08542277216976,
+      "eval_accuracy": 0.9868820974514447,
+      "eval_f1": 0.8562118190241375,
+      "eval_loss": 0.05674006789922714,
+      "eval_precision": 0.8352508617387974,
+      "eval_recall": 0.8782519048309412,
+      "eval_runtime": 63.2356,
+      "eval_samples_per_second": 316.278,
+      "eval_steps_per_second": 8.793,
+      "step": 30000
+    },
+    {
+      "epoch": 1.0890408480769926,
+      "grad_norm": 0.18956594169139862,
+      "learning_rate": 2.2774883317051992e-05,
+      "loss": 0.027218008041381837,
+      "step": 30100
+    },
+    {
+      "epoch": 1.0926589239842253,
+      "grad_norm": 0.24030227959156036,
+      "learning_rate": 2.268443141937118e-05,
+      "loss": 0.03073176145553589,
+      "step": 30200
+    },
+    {
+      "epoch": 1.0962769998914577,
+      "grad_norm": 0.1687329262495041,
+      "learning_rate": 2.2593979521690367e-05,
+      "loss": 0.033424663543701175,
+      "step": 30300
+    },
+    {
+      "epoch": 1.0998950757986903,
+      "grad_norm": 1.2173426151275635,
+      "learning_rate": 2.250352762400955e-05,
+      "loss": 0.03079766035079956,
+      "step": 30400
+    },
+    {
+      "epoch": 1.103513151705923,
+      "grad_norm": 0.35310184955596924,
+      "learning_rate": 2.241307572632874e-05,
+      "loss": 0.03289975881576538,
+      "step": 30500
+    },
+    {
+      "epoch": 1.1071312276131553,
+      "grad_norm": 0.14718961715698242,
+      "learning_rate": 2.2322623828647926e-05,
+      "loss": 0.03266577005386353,
+      "step": 30600
+    },
+    {
+      "epoch": 1.110749303520388,
+      "grad_norm": 0.29442161321640015,
+      "learning_rate": 2.2232171930967113e-05,
+      "loss": 0.02883612871170044,
+      "step": 30700
+    },
+    {
+      "epoch": 1.1143673794276203,
+      "grad_norm": 0.36244460940361023,
+      "learning_rate": 2.21417200332863e-05,
+      "loss": 0.030666334629058836,
+      "step": 30800
+    },
+    {
+      "epoch": 1.117985455334853,
+      "grad_norm": 0.2421630471944809,
+      "learning_rate": 2.2051268135605485e-05,
+      "loss": 0.02931546211242676,
+      "step": 30900
+    },
+    {
+      "epoch": 1.1216035312420856,
+      "grad_norm": 0.5055842995643616,
+      "learning_rate": 2.1960816237924672e-05,
+      "loss": 0.030934171676635744,
+      "step": 31000
+    },
+    {
+      "epoch": 1.125221607149318,
+      "grad_norm": 0.27207571268081665,
+      "learning_rate": 2.187036434024386e-05,
+      "loss": 0.03155987024307251,
+      "step": 31100
+    },
+    {
+      "epoch": 1.1288396830565506,
+      "grad_norm": 0.5190430879592896,
+      "learning_rate": 2.1779912442563047e-05,
+      "loss": 0.030766298770904543,
+      "step": 31200
+    },
+    {
+      "epoch": 1.132457758963783,
+      "grad_norm": 0.5578451156616211,
+      "learning_rate": 2.168946054488223e-05,
+      "loss": 0.030352199077606203,
+      "step": 31300
+    },
+    {
+      "epoch": 1.1360758348710156,
+      "grad_norm": 0.775244951248169,
+      "learning_rate": 2.159900864720142e-05,
+      "loss": 0.027431459426879884,
+      "step": 31400
+    },
+    {
+      "epoch": 1.1396939107782482,
+      "grad_norm": 0.17452310025691986,
+      "learning_rate": 2.1508556749520606e-05,
+      "loss": 0.02899331569671631,
+      "step": 31500
+    },
+    {
+      "epoch": 1.1433119866854806,
+      "grad_norm": 1.0152820348739624,
+      "learning_rate": 2.1418104851839793e-05,
+      "loss": 0.02969914197921753,
+      "step": 31600
+    },
+    {
+      "epoch": 1.1469300625927132,
+      "grad_norm": 0.21474546194076538,
+      "learning_rate": 2.132765295415898e-05,
+      "loss": 0.03098618268966675,
+      "step": 31700
+    },
+    {
+      "epoch": 1.1505481384999456,
+      "grad_norm": 0.27076786756515503,
+      "learning_rate": 2.1237201056478165e-05,
+      "loss": 0.026145567893981935,
+      "step": 31800
+    },
+    {
+      "epoch": 1.1541662144071783,
+      "grad_norm": 0.20778276026248932,
+      "learning_rate": 2.1146749158797352e-05,
+      "loss": 0.030465993881225586,
+      "step": 31900
+    },
+    {
+      "epoch": 1.1577842903144109,
+      "grad_norm": 0.2573922276496887,
+      "learning_rate": 2.105629726111654e-05,
+      "loss": 0.031988742351531985,
+      "step": 32000
+    },
+    {
+      "epoch": 1.1614023662216433,
+      "grad_norm": 0.33712247014045715,
+      "learning_rate": 2.0965845363435727e-05,
+      "loss": 0.031969892978668216,
+      "step": 32100
+    },
+    {
+      "epoch": 1.165020442128876,
+      "grad_norm": 0.5677493214607239,
+      "learning_rate": 2.0875393465754914e-05,
+      "loss": 0.02892348051071167,
+      "step": 32200
+    },
+    {
+      "epoch": 1.1686385180361083,
+      "grad_norm": 0.19627009332180023,
+      "learning_rate": 2.0784941568074098e-05,
+      "loss": 0.02890573740005493,
+      "step": 32300
+    },
+    {
+      "epoch": 1.172256593943341,
+      "grad_norm": 0.2041957825422287,
+      "learning_rate": 2.0694489670393286e-05,
+      "loss": 0.02606424331665039,
+      "step": 32400
+    },
+    {
+      "epoch": 1.1758746698505735,
+      "grad_norm": 0.36798298358917236,
+      "learning_rate": 2.0604037772712473e-05,
+      "loss": 0.029083385467529296,
+      "step": 32500
+    },
+    {
+      "epoch": 1.1758746698505735,
+      "eval_accuracy": 0.9877625116339074,
+      "eval_f1": 0.8611236096967975,
+      "eval_loss": 0.05370509624481201,
+      "eval_precision": 0.8443082257515248,
+      "eval_recall": 0.8786224004896986,
+      "eval_runtime": 62.1854,
+      "eval_samples_per_second": 321.619,
+      "eval_steps_per_second": 8.941,
+      "step": 32500
+    },
+    {
+      "epoch": 1.179492745757806,
+      "grad_norm": 0.2152443379163742,
+      "learning_rate": 2.051358587503166e-05,
+      "loss": 0.028284170627593995,
+      "step": 32600
+    },
+    {
+      "epoch": 1.1831108216650386,
+      "grad_norm": 0.2933087646961212,
+      "learning_rate": 2.0423133977350845e-05,
+      "loss": 0.034238841533660885,
+      "step": 32700
+    },
+    {
+      "epoch": 1.1867288975722712,
+      "grad_norm": 0.36995938420295715,
+      "learning_rate": 2.0332682079670032e-05,
+      "loss": 0.03170938491821289,
+      "step": 32800
+    },
+    {
+      "epoch": 1.1903469734795036,
+      "grad_norm": 0.7478405833244324,
+      "learning_rate": 2.024223018198922e-05,
+      "loss": 0.029751029014587402,
+      "step": 32900
+    },
+    {
+      "epoch": 1.1939650493867362,
+      "grad_norm": 0.44457152485847473,
+      "learning_rate": 2.0151778284308407e-05,
+      "loss": 0.02949444770812988,
+      "step": 33000
+    },
+    {
+      "epoch": 1.1975831252939686,
+      "grad_norm": 0.4324032664299011,
+      "learning_rate": 2.0061326386627594e-05,
+      "loss": 0.030652081966400145,
+      "step": 33100
+    },
+    {
+      "epoch": 1.2012012012012012,
+      "grad_norm": 1.3409758806228638,
+      "learning_rate": 1.9970874488946778e-05,
+      "loss": 0.02934673547744751,
+      "step": 33200
+    },
+    {
+      "epoch": 1.2048192771084336,
+      "grad_norm": 0.3867700397968292,
+      "learning_rate": 1.9880422591265966e-05,
+      "loss": 0.02774231195449829,
+      "step": 33300
+    },
+    {
+      "epoch": 1.2084373530156662,
+      "grad_norm": 0.1256304383277893,
+      "learning_rate": 1.9789970693585153e-05,
+      "loss": 0.030440127849578856,
+      "step": 33400
+    },
+    {
+      "epoch": 1.2120554289228989,
+      "grad_norm": 0.574845552444458,
+      "learning_rate": 1.969951879590434e-05,
+      "loss": 0.030182530879974367,
+      "step": 33500
+    },
+    {
+      "epoch": 1.2156735048301313,
+      "grad_norm": 0.501304566860199,
+      "learning_rate": 1.9609066898223528e-05,
+      "loss": 0.03053757667541504,
+      "step": 33600
+    },
+    {
+      "epoch": 1.2192915807373639,
+      "grad_norm": 0.1869884878396988,
+      "learning_rate": 1.9518615000542712e-05,
+      "loss": 0.02801114559173584,
+      "step": 33700
+    },
+    {
+      "epoch": 1.2229096566445965,
+      "grad_norm": 0.44489210844039917,
+      "learning_rate": 1.94281631028619e-05,
+      "loss": 0.02709296464920044,
+      "step": 33800
+    },
+    {
+      "epoch": 1.226527732551829,
+      "grad_norm": 0.2928631007671356,
+      "learning_rate": 1.9337711205181087e-05,
+      "loss": 0.033639376163482664,
+      "step": 33900
+    },
+    {
+      "epoch": 1.2301458084590615,
+      "grad_norm": 0.2070285826921463,
+      "learning_rate": 1.9247259307500274e-05,
+      "loss": 0.03141526222229004,
+      "step": 34000
+    },
+    {
+      "epoch": 1.233763884366294,
+      "grad_norm": 0.4693046510219574,
+      "learning_rate": 1.9156807409819458e-05,
+      "loss": 0.029341881275177003,
+      "step": 34100
+    },
+    {
+      "epoch": 1.2373819602735265,
+      "grad_norm": 0.187980055809021,
+      "learning_rate": 1.9066355512138645e-05,
+      "loss": 0.033849341869354246,
+      "step": 34200
+    },
+    {
+      "epoch": 1.2410000361807592,
+      "grad_norm": 0.7411011457443237,
+      "learning_rate": 1.8975903614457833e-05,
+      "loss": 0.027842617034912108,
+      "step": 34300
+    },
+    {
+      "epoch": 1.2446181120879916,
+      "grad_norm": 0.4449065327644348,
+      "learning_rate": 1.888545171677702e-05,
+      "loss": 0.031680150032043455,
+      "step": 34400
+    },
+    {
+      "epoch": 1.2482361879952242,
+      "grad_norm": 0.7327262759208679,
+      "learning_rate": 1.8794999819096208e-05,
+      "loss": 0.02651881694793701,
+      "step": 34500
+    },
+    {
+      "epoch": 1.2518542639024566,
+      "grad_norm": 0.41838428378105164,
+      "learning_rate": 1.870454792141539e-05,
+      "loss": 0.032553679943084717,
+      "step": 34600
+    },
+    {
+      "epoch": 1.2554723398096892,
+      "grad_norm": 0.3279021382331848,
+      "learning_rate": 1.861409602373458e-05,
+      "loss": 0.02605849742889404,
+      "step": 34700
+    },
+    {
+      "epoch": 1.2590904157169218,
+      "grad_norm": 0.23042799532413483,
+      "learning_rate": 1.8523644126053766e-05,
+      "loss": 0.02857684135437012,
+      "step": 34800
+    },
+    {
+      "epoch": 1.2627084916241542,
+      "grad_norm": 0.14856815338134766,
+      "learning_rate": 1.8433192228372954e-05,
+      "loss": 0.030806925296783447,
+      "step": 34900
+    },
+    {
+      "epoch": 1.2663265675313868,
+      "grad_norm": 0.48354101181030273,
+      "learning_rate": 1.8342740330692138e-05,
+      "loss": 0.030027375221252442,
+      "step": 35000
+    },
+    {
+      "epoch": 1.2663265675313868,
+      "eval_accuracy": 0.9877813255436068,
+      "eval_f1": 0.8615969042346098,
+      "eval_loss": 0.05214959755539894,
+      "eval_precision": 0.8434818838343312,
+      "eval_recall": 0.8805070957972906,
+      "eval_runtime": 62.9193,
+      "eval_samples_per_second": 317.867,
+      "eval_steps_per_second": 8.837,
+      "step": 35000
+    },
+    {
+      "epoch": 1.2699446434386195,
+      "grad_norm": 0.13334180414676666,
+      "learning_rate": 1.8252288433011325e-05,
+      "loss": 0.027159340381622314,
+      "step": 35100
+    },
+    {
+      "epoch": 1.2735627193458519,
+      "grad_norm": 0.7394197583198547,
+      "learning_rate": 1.8161836535330513e-05,
+      "loss": 0.03075253963470459,
+      "step": 35200
+    },
+    {
+      "epoch": 1.2771807952530845,
+      "grad_norm": 0.2870982587337494,
+      "learning_rate": 1.80713846376497e-05,
+      "loss": 0.030658049583435057,
+      "step": 35300
+    },
+    {
+      "epoch": 1.2807988711603169,
+      "grad_norm": 0.9762187004089355,
+      "learning_rate": 1.7980932739968887e-05,
+      "loss": 0.031029996871948243,
+      "step": 35400
+    },
+    {
+      "epoch": 1.2844169470675495,
+      "grad_norm": 0.44388410449028015,
+      "learning_rate": 1.789048084228807e-05,
+      "loss": 0.03051720142364502,
+      "step": 35500
+    },
+    {
+      "epoch": 1.288035022974782,
+      "grad_norm": 0.7785915732383728,
+      "learning_rate": 1.780002894460726e-05,
+      "loss": 0.02536651849746704,
+      "step": 35600
+    },
+    {
+      "epoch": 1.2916530988820145,
+      "grad_norm": 0.1702079176902771,
+      "learning_rate": 1.7709577046926446e-05,
+      "loss": 0.030427489280700683,
+      "step": 35700
+    },
+    {
+      "epoch": 1.2952711747892471,
+      "grad_norm": 0.4802360236644745,
+      "learning_rate": 1.7619125149245634e-05,
+      "loss": 0.03049640417098999,
+      "step": 35800
+    },
+    {
+      "epoch": 1.2988892506964795,
+      "grad_norm": 0.40013861656188965,
+      "learning_rate": 1.752867325156482e-05,
+      "loss": 0.030040171146392822,
+      "step": 35900
+    },
+    {
+      "epoch": 1.3025073266037122,
+      "grad_norm": 0.34162065386772156,
+      "learning_rate": 1.7438221353884005e-05,
+      "loss": 0.031596968173980715,
+      "step": 36000
+    },
+    {
+      "epoch": 1.3061254025109448,
+      "grad_norm": 0.34575241804122925,
+      "learning_rate": 1.7347769456203193e-05,
+      "loss": 0.03362387895584106,
+      "step": 36100
+    },
+    {
+      "epoch": 1.3097434784181772,
+      "grad_norm": 0.4098789691925049,
+      "learning_rate": 1.725731755852238e-05,
+      "loss": 0.027526361942291258,
+      "step": 36200
+    },
+    {
+      "epoch": 1.3133615543254098,
+      "grad_norm": 0.35067400336265564,
+      "learning_rate": 1.7166865660841567e-05,
+      "loss": 0.02835451364517212,
+      "step": 36300
+    },
+    {
+      "epoch": 1.3169796302326424,
+      "grad_norm": 0.1685800403356552,
+      "learning_rate": 1.707641376316075e-05,
+      "loss": 0.028891866207122804,
+      "step": 36400
+    },
+    {
+      "epoch": 1.3205977061398748,
+      "grad_norm": 0.32651832699775696,
+      "learning_rate": 1.698596186547994e-05,
+      "loss": 0.026589181423187256,
+      "step": 36500
+    },
+    {
+      "epoch": 1.3242157820471072,
+      "grad_norm": 0.3153350353240967,
+      "learning_rate": 1.6895509967799126e-05,
+      "loss": 0.031108696460723877,
+      "step": 36600
+    },
+    {
+      "epoch": 1.3278338579543398,
+      "grad_norm": 0.4476368725299835,
+      "learning_rate": 1.6805058070118314e-05,
+      "loss": 0.030014872550964355,
+      "step": 36700
+    },
+    {
+      "epoch": 1.3314519338615725,
+      "grad_norm": 0.1972656548023224,
+      "learning_rate": 1.67146061724375e-05,
+      "loss": 0.029410278797149657,
+      "step": 36800
+    },
+    {
+      "epoch": 1.3350700097688049,
+      "grad_norm": 0.7246927618980408,
+      "learning_rate": 1.6624154274756685e-05,
+      "loss": 0.03080254316329956,
+      "step": 36900
+    },
+    {
+      "epoch": 1.3386880856760375,
+      "grad_norm": 0.3670811355113983,
+      "learning_rate": 1.6533702377075872e-05,
+      "loss": 0.02861506223678589,
+      "step": 37000
+    },
+    {
+      "epoch": 1.34230616158327,
+      "grad_norm": 0.22275477647781372,
+      "learning_rate": 1.644325047939506e-05,
+      "loss": 0.0255238938331604,
+      "step": 37100
+    },
+    {
+      "epoch": 1.3459242374905025,
+      "grad_norm": 0.3272339999675751,
+      "learning_rate": 1.6352798581714247e-05,
+      "loss": 0.028979463577270506,
+      "step": 37200
+    },
+    {
+      "epoch": 1.3495423133977351,
+      "grad_norm": 0.5552839040756226,
+      "learning_rate": 1.626234668403343e-05,
+      "loss": 0.028283817768096922,
+      "step": 37300
+    },
+    {
+      "epoch": 1.3531603893049677,
+      "grad_norm": 0.33792686462402344,
+      "learning_rate": 1.617189478635262e-05,
+      "loss": 0.03224069595336914,
+      "step": 37400
+    },
+    {
+      "epoch": 1.3567784652122001,
+      "grad_norm": 1.0481899976730347,
+      "learning_rate": 1.6081442888671806e-05,
+      "loss": 0.02690179109573364,
+      "step": 37500
+    },
+    {
+      "epoch": 1.3567784652122001,
+      "eval_accuracy": 0.9878715555186957,
+      "eval_f1": 0.8683487542236398,
+      "eval_loss": 0.05309534817934036,
+      "eval_precision": 0.851476257567078,
+      "eval_recall": 0.8859034456096264,
+      "eval_runtime": 62.1337,
+      "eval_samples_per_second": 321.887,
+      "eval_steps_per_second": 8.948,
+      "step": 37500
+    },
+    {
+      "epoch": 1.3603965411194328,
+      "grad_norm": 0.20256465673446655,
+      "learning_rate": 1.5990990990990993e-05,
+      "loss": 0.027432169914245606,
+      "step": 37600
+    },
+    {
+      "epoch": 1.3640146170266652,
+      "grad_norm": 0.3237811028957367,
+      "learning_rate": 1.590053909331018e-05,
+      "loss": 0.030464730262756347,
+      "step": 37700
+    },
+    {
+      "epoch": 1.3676326929338978,
+      "grad_norm": 0.31953930854797363,
+      "learning_rate": 1.5810087195629365e-05,
+      "loss": 0.027273902893066405,
+      "step": 37800
+    },
+    {
+      "epoch": 1.3712507688411302,
+      "grad_norm": 0.38057664036750793,
+      "learning_rate": 1.5719635297948552e-05,
+      "loss": 0.0259963059425354,
+      "step": 37900
+    },
+    {
+      "epoch": 1.3748688447483628,
+      "grad_norm": 0.6410769820213318,
+      "learning_rate": 1.562918340026774e-05,
+      "loss": 0.031271641254425046,
+      "step": 38000
+    },
+    {
+      "epoch": 1.3784869206555954,
+      "grad_norm": 0.8330540060997009,
+      "learning_rate": 1.5538731502586927e-05,
+      "loss": 0.02934875011444092,
+      "step": 38100
+    },
+    {
+      "epoch": 1.3821049965628278,
+      "grad_norm": 1.1677355766296387,
+      "learning_rate": 1.5448279604906114e-05,
+      "loss": 0.02971445083618164,
+      "step": 38200
+    },
+    {
+      "epoch": 1.3857230724700604,
+      "grad_norm": 0.4667145609855652,
+      "learning_rate": 1.53578277072253e-05,
+      "loss": 0.02775926113128662,
+      "step": 38300
+    },
+    {
+      "epoch": 1.389341148377293,
+      "grad_norm": 0.4434032440185547,
+      "learning_rate": 1.5267375809544486e-05,
+      "loss": 0.026833882331848146,
+      "step": 38400
+    },
+    {
+      "epoch": 1.3929592242845255,
+      "grad_norm": 0.2564474642276764,
+      "learning_rate": 1.5176923911863672e-05,
+      "loss": 0.02980698347091675,
+      "step": 38500
+    },
+    {
+      "epoch": 1.396577300191758,
+      "grad_norm": 0.43813377618789673,
+      "learning_rate": 1.5086472014182859e-05,
+      "loss": 0.028636832237243653,
+      "step": 38600
+    },
+    {
+      "epoch": 1.4001953760989905,
+      "grad_norm": 0.928669810295105,
+      "learning_rate": 1.4996020116502043e-05,
+      "loss": 0.02784595012664795,
+      "step": 38700
+    },
+    {
+      "epoch": 1.403813452006223,
+      "grad_norm": 1.0816453695297241,
+      "learning_rate": 1.490556821882123e-05,
+      "loss": 0.031624915599823,
+      "step": 38800
+    },
+    {
+      "epoch": 1.4074315279134555,
+      "grad_norm": 1.6790099143981934,
+      "learning_rate": 1.4815116321140418e-05,
+      "loss": 0.02443223476409912,
+      "step": 38900
+    },
+    {
+      "epoch": 1.4110496038206881,
+      "grad_norm": 0.39879387617111206,
+      "learning_rate": 1.4724664423459605e-05,
+      "loss": 0.02753525972366333,
+      "step": 39000
+    },
+    {
+      "epoch": 1.4146676797279207,
+      "grad_norm": 0.6372315883636475,
+      "learning_rate": 1.4634212525778793e-05,
+      "loss": 0.02859419822692871,
+      "step": 39100
+    },
+    {
+      "epoch": 1.4182857556351531,
+      "grad_norm": 0.4357219934463501,
+      "learning_rate": 1.4543760628097977e-05,
+      "loss": 0.02929396152496338,
+      "step": 39200
+    },
+    {
+      "epoch": 1.4219038315423858,
+      "grad_norm": 0.8673311471939087,
+      "learning_rate": 1.4453308730417164e-05,
+      "loss": 0.027733774185180665,
+      "step": 39300
+    },
+    {
+      "epoch": 1.4255219074496184,
+      "grad_norm": 0.31178081035614014,
+      "learning_rate": 1.4362856832736351e-05,
+      "loss": 0.029380517005920412,
+      "step": 39400
+    },
+    {
+      "epoch": 1.4291399833568508,
+      "grad_norm": 0.9862114191055298,
+      "learning_rate": 1.4272404935055539e-05,
+      "loss": 0.02801510810852051,
+      "step": 39500
+    },
+    {
+      "epoch": 1.4327580592640834,
+      "grad_norm": 0.3226287364959717,
+      "learning_rate": 1.4181953037374726e-05,
+      "loss": 0.02600921630859375,
+      "step": 39600
+    },
+    {
+      "epoch": 1.436376135171316,
+      "grad_norm": 1.0932515859603882,
+      "learning_rate": 1.409150113969391e-05,
+      "loss": 0.027818257808685302,
+      "step": 39700
+    },
+    {
+      "epoch": 1.4399942110785484,
+      "grad_norm": 0.4064158797264099,
+      "learning_rate": 1.4001049242013098e-05,
+      "loss": 0.030927972793579103,
+      "step": 39800
+    },
+    {
+      "epoch": 1.443612286985781,
+      "grad_norm": 0.6574753522872925,
+      "learning_rate": 1.3910597344332285e-05,
+      "loss": 0.028972697257995606,
+      "step": 39900
+    },
+    {
+      "epoch": 1.4472303628930134,
+      "grad_norm": 0.24314340949058533,
+      "learning_rate": 1.3820145446651472e-05,
+      "loss": 0.029455924034118654,
+      "step": 40000
+    },
+    {
+      "epoch": 1.4472303628930134,
+      "eval_accuracy": 0.9882140454666924,
+      "eval_f1": 0.8711891990109102,
+      "eval_loss": 0.05167451128363609,
+      "eval_precision": 0.8548262069393198,
+      "eval_recall": 0.8881908535897808,
+      "eval_runtime": 62.5842,
+      "eval_samples_per_second": 319.57,
+      "eval_steps_per_second": 8.884,
+      "step": 40000
+    },
+    {
+      "epoch": 1.450848438800246,
+      "grad_norm": 0.28122034668922424,
+      "learning_rate": 1.3729693548970656e-05,
+      "loss": 0.029821088314056398,
+      "step": 40100
+    },
+    {
+      "epoch": 1.4544665147074785,
+      "grad_norm": 0.45019853115081787,
+      "learning_rate": 1.3639241651289844e-05,
+      "loss": 0.027684724330902098,
+      "step": 40200
+    },
+    {
+      "epoch": 1.458084590614711,
+      "grad_norm": 0.6584652066230774,
+      "learning_rate": 1.3548789753609031e-05,
+      "loss": 0.026381478309631348,
+      "step": 40300
+    },
+    {
+      "epoch": 1.4617026665219437,
+      "grad_norm": 2.1259236335754395,
+      "learning_rate": 1.3458337855928219e-05,
+      "loss": 0.02868267774581909,
+      "step": 40400
+    },
+    {
+      "epoch": 1.465320742429176,
+      "grad_norm": 0.9566027522087097,
+      "learning_rate": 1.3367885958247406e-05,
+      "loss": 0.027485811710357667,
+      "step": 40500
+    },
+    {
+      "epoch": 1.4689388183364087,
+      "grad_norm": 0.9289085268974304,
+      "learning_rate": 1.327743406056659e-05,
+      "loss": 0.030939743518829346,
+      "step": 40600
+    },
+    {
+      "epoch": 1.4725568942436413,
+      "grad_norm": 0.6716954112052917,
+      "learning_rate": 1.3186982162885778e-05,
+      "loss": 0.026526257991790772,
+      "step": 40700
+    },
+    {
+      "epoch": 1.4761749701508737,
+      "grad_norm": 0.26186442375183105,
+      "learning_rate": 1.3096530265204965e-05,
+      "loss": 0.027606160640716554,
+      "step": 40800
+    },
+    {
+      "epoch": 1.4797930460581064,
+      "grad_norm": 0.5962882041931152,
+      "learning_rate": 1.3006078367524152e-05,
+      "loss": 0.03013371229171753,
+      "step": 40900
+    },
+    {
+      "epoch": 1.4834111219653388,
+      "grad_norm": 0.28622719645500183,
+      "learning_rate": 1.2915626469843336e-05,
+      "loss": 0.026788763999938965,
+      "step": 41000
+    },
+    {
+      "epoch": 1.4870291978725714,
+      "grad_norm": 0.2146042138338089,
+      "learning_rate": 1.2825174572162524e-05,
+      "loss": 0.026920742988586426,
+      "step": 41100
+    },
+    {
+      "epoch": 1.4906472737798038,
+      "grad_norm": 0.30449753999710083,
+      "learning_rate": 1.2734722674481711e-05,
+      "loss": 0.028757052421569826,
+      "step": 41200
+    },
+    {
+      "epoch": 1.4942653496870364,
+      "grad_norm": 0.11651007831096649,
+      "learning_rate": 1.2644270776800899e-05,
+      "loss": 0.029123516082763673,
+      "step": 41300
+    },
+    {
+      "epoch": 1.497883425594269,
+      "grad_norm": 3.1146299839019775,
+      "learning_rate": 1.2553818879120086e-05,
+      "loss": 0.028435797691345216,
+      "step": 41400
+    },
+    {
+      "epoch": 1.5015015015015014,
+      "grad_norm": 0.2705380916595459,
+      "learning_rate": 1.2463366981439272e-05,
+      "loss": 0.03229628562927246,
+      "step": 41500
+    },
+    {
+      "epoch": 1.505119577408734,
+      "grad_norm": 0.5641364455223083,
+      "learning_rate": 1.2372915083758457e-05,
+      "loss": 0.02912388801574707,
+      "step": 41600
+    },
+    {
+      "epoch": 1.5087376533159667,
+      "grad_norm": 0.4726872444152832,
+      "learning_rate": 1.2282463186077645e-05,
+      "loss": 0.028761823177337647,
+      "step": 41700
+    },
+    {
+      "epoch": 1.512355729223199,
+      "grad_norm": 2.5604758262634277,
+      "learning_rate": 1.2192011288396832e-05,
+      "loss": 0.02635906219482422,
+      "step": 41800
+    },
+    {
+      "epoch": 1.5159738051304317,
+      "grad_norm": 0.3598019778728485,
+      "learning_rate": 1.2101559390716018e-05,
+      "loss": 0.026577677726745606,
+      "step": 41900
+    },
+    {
+      "epoch": 1.5195918810376643,
+      "grad_norm": 0.31742435693740845,
+      "learning_rate": 1.2011107493035205e-05,
+      "loss": 0.02479785919189453,
+      "step": 42000
+    },
+    {
+      "epoch": 1.5232099569448967,
+      "grad_norm": 1.0102005004882812,
+      "learning_rate": 1.1920655595354391e-05,
+      "loss": 0.028279991149902345,
+      "step": 42100
+    },
+    {
+      "epoch": 1.526828032852129,
+      "grad_norm": 0.4230172038078308,
+      "learning_rate": 1.1830203697673578e-05,
+      "loss": 0.027808871269226074,
+      "step": 42200
+    },
+    {
+      "epoch": 1.530446108759362,
+      "grad_norm": 0.35221824049949646,
+      "learning_rate": 1.1739751799992764e-05,
+      "loss": 0.02666907787322998,
+      "step": 42300
+    },
+    {
+      "epoch": 1.5340641846665943,
+      "grad_norm": 0.37867021560668945,
+      "learning_rate": 1.1649299902311952e-05,
+      "loss": 0.028237838745117188,
+      "step": 42400
+    },
+    {
+      "epoch": 1.5376822605738267,
+      "grad_norm": 1.1692699193954468,
+      "learning_rate": 1.1558848004631137e-05,
+      "loss": 0.027906298637390137,
+      "step": 42500
+    },
+    {
+      "epoch": 1.5376822605738267,
+      "eval_accuracy": 0.9883852904406909,
+      "eval_f1": 0.8713540843735187,
+      "eval_loss": 0.048916082829236984,
+      "eval_precision": 0.8549944962093611,
+      "eval_recall": 0.8883519386588057,
+      "eval_runtime": 62.2278,
+      "eval_samples_per_second": 321.4,
+      "eval_steps_per_second": 8.935,
+      "step": 42500
+    },
+    {
+      "epoch": 1.5413003364810594,
+      "grad_norm": 0.273318886756897,
+      "learning_rate": 1.1468396106950325e-05,
+      "loss": 0.031116650104522706,
+      "step": 42600
+    },
+    {
+      "epoch": 1.544918412388292,
+      "grad_norm": 0.48087653517723083,
+      "learning_rate": 1.1377944209269512e-05,
+      "loss": 0.026544408798217775,
+      "step": 42700
+    },
+    {
+      "epoch": 1.5485364882955244,
+      "grad_norm": 0.7746985554695129,
+      "learning_rate": 1.1287492311588698e-05,
+      "loss": 0.026500403881072998,
+      "step": 42800
+    },
+    {
+      "epoch": 1.552154564202757,
+      "grad_norm": 0.1549975574016571,
+      "learning_rate": 1.1197040413907885e-05,
+      "loss": 0.026587300300598145,
+      "step": 42900
+    },
+    {
+      "epoch": 1.5557726401099896,
+      "grad_norm": 1.972495198249817,
+      "learning_rate": 1.110658851622707e-05,
+      "loss": 0.029258613586425782,
+      "step": 43000
+    },
+    {
+      "epoch": 1.559390716017222,
+      "grad_norm": 0.6956634521484375,
+      "learning_rate": 1.1016136618546258e-05,
+      "loss": 0.026978886127471922,
+      "step": 43100
+    },
+    {
+      "epoch": 1.5630087919244544,
+      "grad_norm": 0.16629020869731903,
+      "learning_rate": 1.0925684720865444e-05,
+      "loss": 0.03226327657699585,
+      "step": 43200
+    },
+    {
+      "epoch": 1.5666268678316873,
+      "grad_norm": 0.37136366963386536,
+      "learning_rate": 1.0835232823184631e-05,
+      "loss": 0.028375396728515623,
+      "step": 43300
+    },
+    {
+      "epoch": 1.5702449437389197,
+      "grad_norm": 0.2561453580856323,
+      "learning_rate": 1.0744780925503819e-05,
+      "loss": 0.027073240280151366,
+      "step": 43400
+    },
+    {
+      "epoch": 1.573863019646152,
+      "grad_norm": 0.42630210518836975,
+      "learning_rate": 1.0654329027823004e-05,
+      "loss": 0.026704757213592528,
+      "step": 43500
+    },
+    {
+      "epoch": 1.5774810955533847,
+      "grad_norm": 0.4090301990509033,
+      "learning_rate": 1.0563877130142192e-05,
+      "loss": 0.02855618476867676,
+      "step": 43600
+    },
+    {
+      "epoch": 1.5810991714606173,
+      "grad_norm": 0.24324025213718414,
+      "learning_rate": 1.0473425232461378e-05,
+      "loss": 0.025224699974060058,
+      "step": 43700
+    },
+    {
+      "epoch": 1.5847172473678497,
+      "grad_norm": 0.4220653772354126,
+      "learning_rate": 1.0382973334780565e-05,
+      "loss": 0.029145328998565673,
+      "step": 43800
+    },
+    {
+      "epoch": 1.5883353232750823,
+      "grad_norm": 0.4333362281322479,
+      "learning_rate": 1.029252143709975e-05,
+      "loss": 0.025774214267730713,
+      "step": 43900
+    },
+    {
+      "epoch": 1.591953399182315,
+      "grad_norm": 0.15959997475147247,
+      "learning_rate": 1.0202069539418938e-05,
+      "loss": 0.026988446712493896,
+      "step": 44000
+    },
+    {
+      "epoch": 1.5955714750895473,
+      "grad_norm": 0.2643369138240814,
+      "learning_rate": 1.0111617641738126e-05,
+      "loss": 0.0258998441696167,
+      "step": 44100
+    },
+    {
+      "epoch": 1.59918955099678,
+      "grad_norm": 0.8528566360473633,
+      "learning_rate": 1.0021165744057311e-05,
+      "loss": 0.02746238708496094,
+      "step": 44200
+    },
+    {
+      "epoch": 1.6028076269040126,
+      "grad_norm": 0.999005138874054,
+      "learning_rate": 9.930713846376499e-06,
+      "loss": 0.028600902557373048,
+      "step": 44300
+    },
+    {
+      "epoch": 1.606425702811245,
+      "grad_norm": 0.6834824681282043,
+      "learning_rate": 9.840261948695684e-06,
+      "loss": 0.028850455284118653,
+      "step": 44400
+    },
+    {
+      "epoch": 1.6100437787184774,
+      "grad_norm": 0.3043724298477173,
+      "learning_rate": 9.749810051014872e-06,
+      "loss": 0.0262698769569397,
+      "step": 44500
+    },
+    {
+      "epoch": 1.6136618546257102,
+      "grad_norm": 0.8399735689163208,
+      "learning_rate": 9.659358153334057e-06,
+      "loss": 0.02827603816986084,
+      "step": 44600
+    },
+    {
+      "epoch": 1.6172799305329426,
+      "grad_norm": 0.9611870646476746,
+      "learning_rate": 9.568906255653245e-06,
+      "loss": 0.02755260467529297,
+      "step": 44700
+    },
+    {
+      "epoch": 1.620898006440175,
+      "grad_norm": 0.23461508750915527,
+      "learning_rate": 9.47845435797243e-06,
+      "loss": 0.0311501145362854,
+      "step": 44800
+    },
+    {
+      "epoch": 1.6245160823474076,
+      "grad_norm": 2.882127046585083,
+      "learning_rate": 9.388002460291618e-06,
+      "loss": 0.029984614849090575,
+      "step": 44900
+    },
+    {
+      "epoch": 1.6281341582546403,
+      "grad_norm": 0.32786279916763306,
+      "learning_rate": 9.297550562610804e-06,
+      "loss": 0.028132951259613036,
+      "step": 45000
+    },
+    {
+      "epoch": 1.6281341582546403,
+      "eval_accuracy": 0.9886770980197016,
+      "eval_f1": 0.8710388819944511,
+      "eval_loss": 0.047967541962862015,
+      "eval_precision": 0.855134094859697,
+      "eval_recall": 0.887546513313681,
+      "eval_runtime": 62.7107,
+      "eval_samples_per_second": 318.925,
+      "eval_steps_per_second": 8.866,
+      "step": 45000
+    },
+    {
+      "epoch": 1.6317522341618727,
+      "grad_norm": 1.6328613758087158,
+      "learning_rate": 9.207098664929991e-06,
+      "loss": 0.028099877834320067,
+      "step": 45100
+    },
+    {
+      "epoch": 1.6353703100691053,
+      "grad_norm": 1.1488419771194458,
+      "learning_rate": 9.116646767249177e-06,
+      "loss": 0.025699715614318847,
+      "step": 45200
+    },
+    {
+      "epoch": 1.638988385976338,
+      "grad_norm": 1.2527875900268555,
+      "learning_rate": 9.026194869568364e-06,
+      "loss": 0.02980081081390381,
+      "step": 45300
+    },
+    {
+      "epoch": 1.6426064618835703,
+      "grad_norm": 0.25659850239753723,
+      "learning_rate": 8.93574297188755e-06,
+      "loss": 0.02849080801010132,
+      "step": 45400
+    },
+    {
+      "epoch": 1.6462245377908027,
+      "grad_norm": 0.24858339130878448,
+      "learning_rate": 8.845291074206737e-06,
+      "loss": 0.02909574508666992,
+      "step": 45500
+    },
+    {
+      "epoch": 1.6498426136980355,
+      "grad_norm": 0.35774946212768555,
+      "learning_rate": 8.754839176525923e-06,
+      "loss": 0.028034112453460693,
+      "step": 45600
+    },
+    {
+      "epoch": 1.653460689605268,
+      "grad_norm": 0.28512680530548096,
+      "learning_rate": 8.66438727884511e-06,
+      "loss": 0.029735114574432373,
+      "step": 45700
+    },
+    {
+      "epoch": 1.6570787655125003,
+      "grad_norm": 0.12049074470996857,
+      "learning_rate": 8.573935381164296e-06,
+      "loss": 0.03128848075866699,
+      "step": 45800
+    },
+    {
+      "epoch": 1.660696841419733,
+      "grad_norm": 0.5767261385917664,
+      "learning_rate": 8.483483483483484e-06,
+      "loss": 0.02762418031692505,
+      "step": 45900
+    },
+    {
+      "epoch": 1.6643149173269656,
+      "grad_norm": 0.12318204343318939,
+      "learning_rate": 8.39303158580267e-06,
+      "loss": 0.026004743576049805,
+      "step": 46000
+    },
+    {
+      "epoch": 1.667932993234198,
+      "grad_norm": 0.311279833316803,
+      "learning_rate": 8.302579688121857e-06,
+      "loss": 0.024458692073822022,
+      "step": 46100
+    },
+    {
+      "epoch": 1.6715510691414306,
+      "grad_norm": 0.2753770351409912,
+      "learning_rate": 8.212127790441042e-06,
+      "loss": 0.026231870651245118,
+      "step": 46200
+    },
+    {
+      "epoch": 1.6751691450486632,
+      "grad_norm": 0.8421895503997803,
+      "learning_rate": 8.12167589276023e-06,
+      "loss": 0.02496417760848999,
+      "step": 46300
+    },
+    {
+      "epoch": 1.6787872209558956,
+      "grad_norm": 0.6493498086929321,
+      "learning_rate": 8.031223995079417e-06,
+      "loss": 0.026742682456970215,
+      "step": 46400
+    },
+    {
+      "epoch": 1.6824052968631282,
+      "grad_norm": 0.3029896318912506,
+      "learning_rate": 7.940772097398603e-06,
+      "loss": 0.024227650165557862,
+      "step": 46500
+    },
+    {
+      "epoch": 1.6860233727703609,
+      "grad_norm": 0.34622183442115784,
+      "learning_rate": 7.85032019971779e-06,
+      "loss": 0.025336668491363526,
+      "step": 46600
+    },
+    {
+      "epoch": 1.6896414486775932,
+      "grad_norm": 1.1520912647247314,
+      "learning_rate": 7.759868302036976e-06,
+      "loss": 0.028549084663391112,
+      "step": 46700
+    },
+    {
+      "epoch": 1.6932595245848256,
+      "grad_norm": 0.11390261352062225,
+      "learning_rate": 7.669416404356163e-06,
+      "loss": 0.025614957809448242,
+      "step": 46800
+    },
+    {
+      "epoch": 1.6968776004920583,
+      "grad_norm": 0.20818683505058289,
+      "learning_rate": 7.57896450667535e-06,
+      "loss": 0.02624866247177124,
+      "step": 46900
+    },
+    {
+      "epoch": 1.700495676399291,
+      "grad_norm": 0.11861401051282883,
+      "learning_rate": 7.488512608994537e-06,
+      "loss": 0.029836065769195556,
+      "step": 47000
+    },
+    {
+      "epoch": 1.7041137523065233,
+      "grad_norm": 0.21509072184562683,
+      "learning_rate": 7.398060711313724e-06,
+      "loss": 0.02764824151992798,
+      "step": 47100
+    },
+    {
+      "epoch": 1.707731828213756,
+      "grad_norm": 0.09410534054040909,
+      "learning_rate": 7.3076088136329105e-06,
+      "loss": 0.026358423233032228,
+      "step": 47200
+    },
+    {
+      "epoch": 1.7113499041209885,
+      "grad_norm": 0.4441370666027069,
+      "learning_rate": 7.217156915952097e-06,
+      "loss": 0.028589205741882326,
+      "step": 47300
+    },
+    {
+      "epoch": 1.714967980028221,
+      "grad_norm": 0.301600843667984,
+      "learning_rate": 7.1267050182712836e-06,
+      "loss": 0.02586300849914551,
+      "step": 47400
+    },
+    {
+      "epoch": 1.7185860559354535,
+      "grad_norm": 0.2969602942466736,
+      "learning_rate": 7.03625312059047e-06,
+      "loss": 0.027719602584838868,
+      "step": 47500
+    },
+    {
+      "epoch": 1.7185860559354535,
+      "eval_accuracy": 0.9887869098191715,
+      "eval_f1": 0.8751810891473175,
+      "eval_loss": 0.04670108109712601,
+      "eval_precision": 0.8604607721046077,
+      "eval_recall": 0.8904138275423251,
+      "eval_runtime": 62.4542,
+      "eval_samples_per_second": 320.234,
+      "eval_steps_per_second": 8.903,
+      "step": 47500
+    },
+    {
+      "epoch": 1.7222041318426862,
+      "grad_norm": 2.922269582748413,
+      "learning_rate": 6.945801222909657e-06,
+      "loss": 0.026613037586212158,
+      "step": 47600
+    },
+    {
+      "epoch": 1.7258222077499186,
+      "grad_norm": 0.3603607714176178,
+      "learning_rate": 6.855349325228843e-06,
+      "loss": 0.02875258445739746,
+      "step": 47700
+    },
+    {
+      "epoch": 1.729440283657151,
+      "grad_norm": 0.17424313724040985,
+      "learning_rate": 6.764897427548031e-06,
+      "loss": 0.028092458248138427,
+      "step": 47800
+    },
+    {
+      "epoch": 1.7330583595643838,
+      "grad_norm": 0.39376911520957947,
+      "learning_rate": 6.674445529867217e-06,
+      "loss": 0.029860684871673582,
+      "step": 47900
+    },
+    {
+      "epoch": 1.7366764354716162,
+      "grad_norm": 0.30766257643699646,
+      "learning_rate": 6.583993632186404e-06,
+      "loss": 0.027765181064605713,
+      "step": 48000
+    },
+    {
+      "epoch": 1.7402945113788486,
+      "grad_norm": 0.4809003472328186,
+      "learning_rate": 6.49354173450559e-06,
+      "loss": 0.025850486755371094,
+      "step": 48100
+    },
+    {
+      "epoch": 1.7439125872860812,
+      "grad_norm": 0.31469446420669556,
+      "learning_rate": 6.403089836824777e-06,
+      "loss": 0.024390408992767332,
+      "step": 48200
+    },
+    {
+      "epoch": 1.7475306631933138,
+      "grad_norm": 0.1946684867143631,
+      "learning_rate": 6.312637939143963e-06,
+      "loss": 0.02534383535385132,
+      "step": 48300
+    },
+    {
+      "epoch": 1.7511487391005462,
+      "grad_norm": 0.31097686290740967,
+      "learning_rate": 6.22218604146315e-06,
+      "loss": 0.02695645809173584,
+      "step": 48400
+    },
+    {
+      "epoch": 1.7547668150077789,
+      "grad_norm": 0.7921291589736938,
+      "learning_rate": 6.1317341437823365e-06,
+      "loss": 0.023772099018096925,
+      "step": 48500
+    },
+    {
+      "epoch": 1.7583848909150115,
+      "grad_norm": 0.3385520577430725,
+      "learning_rate": 6.041282246101523e-06,
+      "loss": 0.024593567848205565,
+      "step": 48600
+    },
+    {
+      "epoch": 1.7620029668222439,
+      "grad_norm": 0.23133955895900726,
+      "learning_rate": 5.95083034842071e-06,
+      "loss": 0.025404906272888182,
+      "step": 48700
+    },
+    {
+      "epoch": 1.7656210427294765,
+      "grad_norm": 0.17175310850143433,
+      "learning_rate": 5.860378450739896e-06,
+      "loss": 0.024191346168518067,
+      "step": 48800
+    },
+    {
+      "epoch": 1.7692391186367091,
+      "grad_norm": 1.453963041305542,
+      "learning_rate": 5.769926553059084e-06,
+      "loss": 0.023371386528015136,
+      "step": 48900
+    },
+    {
+      "epoch": 1.7728571945439415,
+      "grad_norm": 0.4487530291080475,
+      "learning_rate": 5.67947465537827e-06,
+      "loss": 0.024376935958862304,
+      "step": 49000
+    },
+    {
+      "epoch": 1.776475270451174,
+      "grad_norm": 0.17453834414482117,
+      "learning_rate": 5.589022757697457e-06,
+      "loss": 0.027640838623046875,
+      "step": 49100
+    },
+    {
+      "epoch": 1.7800933463584065,
+      "grad_norm": 0.24941837787628174,
+      "learning_rate": 5.498570860016643e-06,
+      "loss": 0.02413508415222168,
+      "step": 49200
+    },
+    {
+      "epoch": 1.7837114222656392,
+      "grad_norm": 0.3545306622982025,
+      "learning_rate": 5.40811896233583e-06,
+      "loss": 0.025269722938537596,
+      "step": 49300
+    },
+    {
+      "epoch": 1.7873294981728716,
+      "grad_norm": 0.21222856640815735,
+      "learning_rate": 5.317667064655016e-06,
+      "loss": 0.02443007230758667,
+      "step": 49400
+    },
+    {
+      "epoch": 1.7909475740801042,
+      "grad_norm": 0.5955353379249573,
+      "learning_rate": 5.227215166974203e-06,
+      "loss": 0.027793030738830566,
+      "step": 49500
+    },
+    {
+      "epoch": 1.7945656499873368,
+      "grad_norm": 1.0362492799758911,
+      "learning_rate": 5.13676326929339e-06,
+      "loss": 0.02576704978942871,
+      "step": 49600
+    },
+    {
+      "epoch": 1.7981837258945692,
+      "grad_norm": 0.2961190938949585,
+      "learning_rate": 5.046311371612577e-06,
+      "loss": 0.027634003162384034,
+      "step": 49700
+    },
+    {
+      "epoch": 1.8018018018018018,
+      "grad_norm": 0.2701990604400635,
+      "learning_rate": 4.9558594739317635e-06,
+      "loss": 0.026762216091156005,
+      "step": 49800
+    },
+    {
+      "epoch": 1.8054198777090344,
+      "grad_norm": 0.3419773280620575,
+      "learning_rate": 4.86540757625095e-06,
+      "loss": 0.028021221160888673,
+      "step": 49900
+    },
+    {
+      "epoch": 1.8090379536162668,
+      "grad_norm": 0.3847455680370331,
+      "learning_rate": 4.7749556785701366e-06,
+      "loss": 0.028925769329071045,
+      "step": 50000
+    },
+    {
+      "epoch": 1.8090379536162668,
+      "eval_accuracy": 0.9891697152879526,
+      "eval_f1": 0.8756019071264223,
+      "eval_loss": 0.04578976333141327,
+      "eval_precision": 0.8598627201292046,
+      "eval_recall": 0.8919280271911596,
+      "eval_runtime": 62.7397,
+      "eval_samples_per_second": 318.777,
+      "eval_steps_per_second": 8.862,
+      "step": 50000
+    },
+    {
+      "epoch": 1.8126560295234992,
+      "grad_norm": 0.12807752192020416,
+      "learning_rate": 4.684503780889323e-06,
+      "loss": 0.024477434158325196,
+      "step": 50100
+    },
+    {
+      "epoch": 1.816274105430732,
+      "grad_norm": 0.5839409828186035,
+      "learning_rate": 4.59405188320851e-06,
+      "loss": 0.029098427295684813,
+      "step": 50200
+    },
+    {
+      "epoch": 1.8198921813379645,
+      "grad_norm": 0.1988334357738495,
+      "learning_rate": 4.503599985527696e-06,
+      "loss": 0.027852838039398194,
+      "step": 50300
+    },
+    {
+      "epoch": 1.8235102572451969,
+      "grad_norm": 1.1250760555267334,
+      "learning_rate": 4.413148087846884e-06,
+      "loss": 0.025283007621765136,
+      "step": 50400
+    },
+    {
+      "epoch": 1.8271283331524295,
+      "grad_norm": 0.3275587558746338,
+      "learning_rate": 4.32269619016607e-06,
+      "loss": 0.0253476619720459,
+      "step": 50500
+    },
+    {
+      "epoch": 1.8307464090596621,
+      "grad_norm": 0.2422463297843933,
+      "learning_rate": 4.232244292485257e-06,
+      "loss": 0.025618109703063965,
+      "step": 50600
+    },
+    {
+      "epoch": 1.8343644849668945,
+      "grad_norm": 0.6434578895568848,
+      "learning_rate": 4.141792394804443e-06,
+      "loss": 0.026464188098907472,
+      "step": 50700
+    },
+    {
+      "epoch": 1.8379825608741271,
+      "grad_norm": 0.16934601962566376,
+      "learning_rate": 4.05134049712363e-06,
+      "loss": 0.025098586082458497,
+      "step": 50800
+    },
+    {
+      "epoch": 1.8416006367813598,
+      "grad_norm": 0.21844395995140076,
+      "learning_rate": 3.9608885994428164e-06,
+      "loss": 0.023906781673431396,
+      "step": 50900
+    },
+    {
+      "epoch": 1.8452187126885922,
+      "grad_norm": 0.2674906253814697,
+      "learning_rate": 3.870436701762003e-06,
+      "loss": 0.026905314922332765,
+      "step": 51000
+    },
+    {
+      "epoch": 1.8488367885958248,
+      "grad_norm": 0.4344836473464966,
+      "learning_rate": 3.77998480408119e-06,
+      "loss": 0.026017348766326904,
+      "step": 51100
+    },
+    {
+      "epoch": 1.8524548645030574,
+      "grad_norm": 0.5953734517097473,
+      "learning_rate": 3.6895329064003765e-06,
+      "loss": 0.02634397745132446,
+      "step": 51200
+    },
+    {
+      "epoch": 1.8560729404102898,
+      "grad_norm": 0.14901016652584076,
+      "learning_rate": 3.599081008719563e-06,
+      "loss": 0.02832331895828247,
+      "step": 51300
+    },
+    {
+      "epoch": 1.8596910163175222,
+      "grad_norm": 0.7816808223724365,
+      "learning_rate": 3.5086291110387496e-06,
+      "loss": 0.026141095161437988,
+      "step": 51400
+    },
+    {
+      "epoch": 1.8633090922247548,
+      "grad_norm": 0.5734632015228271,
+      "learning_rate": 3.418177213357936e-06,
+      "loss": 0.02372182607650757,
+      "step": 51500
+    },
+    {
+      "epoch": 1.8669271681319874,
+      "grad_norm": 0.9664448499679565,
+      "learning_rate": 3.3277253156771227e-06,
+      "loss": 0.024712865352630616,
+      "step": 51600
+    },
+    {
+      "epoch": 1.8705452440392198,
+      "grad_norm": 0.390066921710968,
+      "learning_rate": 3.2372734179963093e-06,
+      "loss": 0.026522459983825682,
+      "step": 51700
+    },
+    {
+      "epoch": 1.8741633199464525,
+      "grad_norm": 0.6472379565238953,
+      "learning_rate": 3.146821520315496e-06,
+      "loss": 0.024525246620178222,
+      "step": 51800
+    },
+    {
+      "epoch": 1.877781395853685,
+      "grad_norm": 0.4985784888267517,
+      "learning_rate": 3.056369622634683e-06,
+      "loss": 0.02446552038192749,
+      "step": 51900
+    },
+    {
+      "epoch": 1.8813994717609175,
+      "grad_norm": 0.22120802104473114,
+      "learning_rate": 2.9659177249538694e-06,
+      "loss": 0.025269200801849367,
+      "step": 52000
+    },
+    {
+      "epoch": 1.88501754766815,
+      "grad_norm": 0.3579547703266144,
+      "learning_rate": 2.8754658272730564e-06,
+      "loss": 0.025214505195617676,
+      "step": 52100
+    },
+    {
+      "epoch": 1.8886356235753827,
+      "grad_norm": 0.7338326573371887,
+      "learning_rate": 2.785013929592243e-06,
+      "loss": 0.02668466329574585,
+      "step": 52200
+    },
+    {
+      "epoch": 1.8922536994826151,
+      "grad_norm": 0.3315567970275879,
+      "learning_rate": 2.6945620319114295e-06,
+      "loss": 0.030078487396240236,
+      "step": 52300
+    },
+    {
+      "epoch": 1.8958717753898475,
+      "grad_norm": 0.35072797536849976,
+      "learning_rate": 2.6041101342306165e-06,
+      "loss": 0.02516920804977417,
+      "step": 52400
+    },
+    {
+      "epoch": 1.8994898512970804,
+      "grad_norm": 0.43289047479629517,
+      "learning_rate": 2.513658236549803e-06,
+      "loss": 0.026839351654052733,
+      "step": 52500
+    },
+    {
+      "epoch": 1.8994898512970804,
+      "eval_accuracy": 0.9891036746253344,
+      "eval_f1": 0.876242095754291,
+      "eval_loss": 0.045680414885282516,
+      "eval_precision": 0.8623029055350209,
+      "eval_recall": 0.89063934663896,
+      "eval_runtime": 62.307,
+      "eval_samples_per_second": 320.991,
+      "eval_steps_per_second": 8.924,
+      "step": 52500
+    },
+    {
+      "epoch": 1.9031079272043128,
+      "grad_norm": 0.4170491099357605,
+      "learning_rate": 2.4232063388689896e-06,
+      "loss": 0.027149310111999513,
+      "step": 52600
+    },
+    {
+      "epoch": 1.9067260031115452,
+      "grad_norm": 0.33568137884140015,
+      "learning_rate": 2.332754441188176e-06,
+      "loss": 0.024306225776672363,
+      "step": 52700
+    },
+    {
+      "epoch": 1.9103440790187778,
+      "grad_norm": 0.831928551197052,
+      "learning_rate": 2.242302543507363e-06,
+      "loss": 0.025090248584747316,
+      "step": 52800
+    },
+    {
+      "epoch": 1.9139621549260104,
+      "grad_norm": 0.2261083424091339,
+      "learning_rate": 2.1518506458265497e-06,
+      "loss": 0.02992173671722412,
+      "step": 52900
+    },
+    {
+      "epoch": 1.9175802308332428,
+      "grad_norm": 0.36420953273773193,
+      "learning_rate": 2.0613987481457362e-06,
+      "loss": 0.026374735832214356,
+      "step": 53000
+    },
+    {
+      "epoch": 1.9211983067404754,
+      "grad_norm": 0.3849758207798004,
+      "learning_rate": 1.970946850464923e-06,
+      "loss": 0.024311881065368652,
+      "step": 53100
+    },
+    {
+      "epoch": 1.924816382647708,
+      "grad_norm": 0.1625661551952362,
+      "learning_rate": 1.8804949527841096e-06,
+      "loss": 0.028159475326538085,
+      "step": 53200
+    },
+    {
+      "epoch": 1.9284344585549404,
+      "grad_norm": 0.10745652765035629,
+      "learning_rate": 1.7900430551032961e-06,
+      "loss": 0.028279855251312255,
+      "step": 53300
+    },
+    {
+      "epoch": 1.932052534462173,
+      "grad_norm": 0.3585937023162842,
+      "learning_rate": 1.6995911574224827e-06,
+      "loss": 0.025097475051879883,
+      "step": 53400
+    },
+    {
+      "epoch": 1.9356706103694057,
+      "grad_norm": 0.3355402648448944,
+      "learning_rate": 1.6091392597416697e-06,
+      "loss": 0.0232719612121582,
+      "step": 53500
+    },
+    {
+      "epoch": 1.939288686276638,
+      "grad_norm": 0.6301077604293823,
+      "learning_rate": 1.5186873620608562e-06,
+      "loss": 0.023976569175720216,
+      "step": 53600
+    },
+    {
+      "epoch": 1.9429067621838705,
+      "grad_norm": 1.720951795578003,
+      "learning_rate": 1.4282354643800428e-06,
+      "loss": 0.027393877506256104,
+      "step": 53700
+    },
+    {
+      "epoch": 1.946524838091103,
+      "grad_norm": 1.0819095373153687,
+      "learning_rate": 1.3377835666992295e-06,
+      "loss": 0.028527204990386964,
+      "step": 53800
+    },
+    {
+      "epoch": 1.9501429139983357,
+      "grad_norm": 0.4960351884365082,
+      "learning_rate": 1.247331669018416e-06,
+      "loss": 0.023636491298675538,
+      "step": 53900
+    },
+    {
+      "epoch": 1.9537609899055681,
+      "grad_norm": 0.6555366516113281,
+      "learning_rate": 1.1568797713376029e-06,
+      "loss": 0.02606668949127197,
+      "step": 54000
+    },
+    {
+      "epoch": 1.9573790658128007,
+      "grad_norm": 0.17520390450954437,
+      "learning_rate": 1.0664278736567894e-06,
+      "loss": 0.024348812103271486,
+      "step": 54100
+    },
+    {
+      "epoch": 1.9609971417200334,
+      "grad_norm": 0.2867375612258911,
+      "learning_rate": 9.75975975975976e-07,
+      "loss": 0.024609763622283936,
+      "step": 54200
+    },
+    {
+      "epoch": 1.9646152176272658,
+      "grad_norm": 0.11981488019227982,
+      "learning_rate": 8.855240782951626e-07,
+      "loss": 0.02563744068145752,
+      "step": 54300
+    },
+    {
+      "epoch": 1.9682332935344984,
+      "grad_norm": 0.25503483414649963,
+      "learning_rate": 7.950721806143494e-07,
+      "loss": 0.026204137802124022,
+      "step": 54400
+    },
+    {
+      "epoch": 1.971851369441731,
+      "grad_norm": 0.23244522511959076,
+      "learning_rate": 7.04620282933536e-07,
+      "loss": 0.0256950044631958,
+      "step": 54500
+    },
+    {
+      "epoch": 1.9754694453489634,
+      "grad_norm": 0.20025278627872467,
+      "learning_rate": 6.141683852527226e-07,
+      "loss": 0.025686397552490234,
+      "step": 54600
+    },
+    {
+      "epoch": 1.9790875212561958,
+      "grad_norm": 0.4756115972995758,
+      "learning_rate": 5.237164875719093e-07,
+      "loss": 0.02578796148300171,
+      "step": 54700
+    },
+    {
+      "epoch": 1.9827055971634286,
+      "grad_norm": 0.27420374751091003,
+      "learning_rate": 4.3326458989109595e-07,
+      "loss": 0.023311092853546142,
+      "step": 54800
+    },
+    {
+      "epoch": 1.986323673070661,
+      "grad_norm": 0.19387075304985046,
+      "learning_rate": 3.4281269221028255e-07,
+      "loss": 0.02670889377593994,
+      "step": 54900
+    },
+    {
+      "epoch": 1.9899417489778934,
+      "grad_norm": 0.726769745349884,
+      "learning_rate": 2.523607945294692e-07,
+      "loss": 0.03058022975921631,
+      "step": 55000
+    },
+    {
+      "epoch": 1.9899417489778934,
+      "eval_accuracy": 0.9892449709267501,
+      "eval_f1": 0.8768783517240833,
+      "eval_loss": 0.0451948419213295,
+      "eval_precision": 0.8626445559677067,
+      "eval_recall": 0.8915897485462072,
+      "eval_runtime": 62.8103,
+      "eval_samples_per_second": 318.419,
+      "eval_steps_per_second": 8.852,
+      "step": 55000
+    },
+    {
+      "epoch": 1.993559824885126,
+      "grad_norm": 0.22022511065006256,
+      "learning_rate": 1.6190889684865588e-07,
+      "loss": 0.026084864139556886,
+      "step": 55100
+    },
+    {
+      "epoch": 1.9971779007923587,
+      "grad_norm": 0.5684672594070435,
+      "learning_rate": 7.145699916784254e-08,
+      "loss": 0.027587156295776367,
+      "step": 55200
+    },
+    {
+      "epoch": 2.0,
+      "step": 55278,
+      "total_flos": 1.9407141577440333e+18,
+      "train_loss": 0.04855243214227653,
+      "train_runtime": 26239.1933,
+      "train_samples_per_second": 303.363,
+      "train_steps_per_second": 2.107
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 55278,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 2500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.9407141577440333e+18,
+  "train_batch_size": 72,
+  "trial_name": null,
+  "trial_params": null
+}