diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4220 @@ +{ + "best_global_step": 55000, + "best_metric": 0.9045753492836575, + "best_model_checkpoint": "./lang-ner-xlmr/checkpoint-55000", + "epoch": 2.0, + "eval_steps": 2500, + "global_step": 55938, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0035753870356466087, + "grad_norm": 4.756625652313232, + "learning_rate": 4.991150917086775e-05, + "loss": 4.384464111328125, + "step": 100 + }, + { + "epoch": 0.007150774071293217, + "grad_norm": 2.1916704177856445, + "learning_rate": 4.982212449497658e-05, + "loss": 0.6957355499267578, + "step": 200 + }, + { + "epoch": 0.010726161106939826, + "grad_norm": 3.089632987976074, + "learning_rate": 4.973273981908542e-05, + "loss": 0.3664897537231445, + "step": 300 + }, + { + "epoch": 0.014301548142586435, + "grad_norm": 3.1561801433563232, + "learning_rate": 4.964335514319425e-05, + "loss": 0.24268556594848634, + "step": 400 + }, + { + "epoch": 0.017876935178233043, + "grad_norm": 4.541702747344971, + "learning_rate": 4.955397046730309e-05, + "loss": 0.18832412719726563, + "step": 500 + }, + { + "epoch": 0.021452322213879653, + "grad_norm": 3.958519220352173, + "learning_rate": 4.946458579141192e-05, + "loss": 0.1659502410888672, + "step": 600 + }, + { + "epoch": 0.025027709249526263, + "grad_norm": 2.7777926921844482, + "learning_rate": 4.9375201115520756e-05, + "loss": 0.1568641757965088, + "step": 700 + }, + { + "epoch": 0.02860309628517287, + "grad_norm": 2.150230646133423, + "learning_rate": 4.928581643962959e-05, + "loss": 0.14989984512329102, + "step": 800 + }, + { + "epoch": 0.032178483320819476, + "grad_norm": 1.7589229345321655, + "learning_rate": 4.9196431763738424e-05, + "loss": 0.13244229316711426, + "step": 900 + }, + { + "epoch": 0.035753870356466086, + "grad_norm": 1.2541024684906006, + 
"learning_rate": 4.910704708784726e-05, + "loss": 0.1411085033416748, + "step": 1000 + }, + { + "epoch": 0.039329257392112696, + "grad_norm": 1.043690800666809, + "learning_rate": 4.901766241195609e-05, + "loss": 0.12802630424499511, + "step": 1100 + }, + { + "epoch": 0.042904644427759306, + "grad_norm": 1.5866156816482544, + "learning_rate": 4.892827773606493e-05, + "loss": 0.11472611427307129, + "step": 1200 + }, + { + "epoch": 0.046480031463405916, + "grad_norm": 2.9468393325805664, + "learning_rate": 4.883889306017377e-05, + "loss": 0.11635817527770996, + "step": 1300 + }, + { + "epoch": 0.050055418499052526, + "grad_norm": 2.623593330383301, + "learning_rate": 4.87495083842826e-05, + "loss": 0.11469986915588379, + "step": 1400 + }, + { + "epoch": 0.05363080553469913, + "grad_norm": 1.0270402431488037, + "learning_rate": 4.8660123708391436e-05, + "loss": 0.10951638221740723, + "step": 1500 + }, + { + "epoch": 0.05720619257034574, + "grad_norm": 0.6011027693748474, + "learning_rate": 4.8570739032500274e-05, + "loss": 0.1056843090057373, + "step": 1600 + }, + { + "epoch": 0.06078157960599235, + "grad_norm": 1.5310850143432617, + "learning_rate": 4.8481354356609104e-05, + "loss": 0.10531362533569336, + "step": 1700 + }, + { + "epoch": 0.06435696664163895, + "grad_norm": 1.9218846559524536, + "learning_rate": 4.839196968071794e-05, + "loss": 0.10761914253234864, + "step": 1800 + }, + { + "epoch": 0.06793235367728556, + "grad_norm": 0.9941307306289673, + "learning_rate": 4.830258500482677e-05, + "loss": 0.10573001861572266, + "step": 1900 + }, + { + "epoch": 0.07150774071293217, + "grad_norm": 1.5511739253997803, + "learning_rate": 4.821320032893561e-05, + "loss": 0.09843612670898437, + "step": 2000 + }, + { + "epoch": 0.07508312774857878, + "grad_norm": 3.8423593044281006, + "learning_rate": 4.812381565304445e-05, + "loss": 0.09764796257019043, + "step": 2100 + }, + { + "epoch": 0.07865851478422539, + "grad_norm": 2.3102476596832275, + "learning_rate": 
4.803443097715328e-05, + "loss": 0.0996187973022461, + "step": 2200 + }, + { + "epoch": 0.082233901819872, + "grad_norm": 0.8750975131988525, + "learning_rate": 4.7945046301262116e-05, + "loss": 0.09662745475769043, + "step": 2300 + }, + { + "epoch": 0.08580928885551861, + "grad_norm": 1.4319772720336914, + "learning_rate": 4.7855661625370954e-05, + "loss": 0.09479823112487792, + "step": 2400 + }, + { + "epoch": 0.08938467589116522, + "grad_norm": 1.1493583917617798, + "learning_rate": 4.7766276949479785e-05, + "loss": 0.09187865257263184, + "step": 2500 + }, + { + "epoch": 0.08938467589116522, + "eval_accuracy": 0.97115720940899, + "eval_f1": 0.7833422259443614, + "eval_loss": 0.12425321340560913, + "eval_precision": 0.7387798259806108, + "eval_recall": 0.8336256217117557, + "eval_runtime": 30.7236, + "eval_samples_per_second": 732.336, + "eval_steps_per_second": 20.343, + "step": 2500 + }, + { + "epoch": 0.09296006292681183, + "grad_norm": 0.4979458749294281, + "learning_rate": 4.767689227358862e-05, + "loss": 0.0913974666595459, + "step": 2600 + }, + { + "epoch": 0.09653544996245844, + "grad_norm": 0.9877503514289856, + "learning_rate": 4.758750759769745e-05, + "loss": 0.09267548561096191, + "step": 2700 + }, + { + "epoch": 0.10011083699810505, + "grad_norm": 1.4115008115768433, + "learning_rate": 4.749812292180629e-05, + "loss": 0.09185708045959473, + "step": 2800 + }, + { + "epoch": 0.10368622403375165, + "grad_norm": 1.2117033004760742, + "learning_rate": 4.740873824591512e-05, + "loss": 0.09133506774902343, + "step": 2900 + }, + { + "epoch": 0.10726161106939826, + "grad_norm": 0.527315080165863, + "learning_rate": 4.731935357002396e-05, + "loss": 0.08854376792907714, + "step": 3000 + }, + { + "epoch": 0.11083699810504487, + "grad_norm": 0.5725809931755066, + "learning_rate": 4.722996889413279e-05, + "loss": 0.08516644477844239, + "step": 3100 + }, + { + "epoch": 0.11441238514069148, + "grad_norm": 1.4227476119995117, + "learning_rate": 4.714058421824163e-05, 
+ "loss": 0.08871203422546386, + "step": 3200 + }, + { + "epoch": 0.11798777217633809, + "grad_norm": 1.2104847431182861, + "learning_rate": 4.705119954235046e-05, + "loss": 0.0874100399017334, + "step": 3300 + }, + { + "epoch": 0.1215631592119847, + "grad_norm": 1.4136381149291992, + "learning_rate": 4.6961814866459295e-05, + "loss": 0.09060199737548828, + "step": 3400 + }, + { + "epoch": 0.1251385462476313, + "grad_norm": 1.7565488815307617, + "learning_rate": 4.687243019056813e-05, + "loss": 0.09233291625976563, + "step": 3500 + }, + { + "epoch": 0.1287139332832779, + "grad_norm": 1.2004791498184204, + "learning_rate": 4.6783045514676964e-05, + "loss": 0.08300934791564941, + "step": 3600 + }, + { + "epoch": 0.1322893203189245, + "grad_norm": 1.7836707830429077, + "learning_rate": 4.66936608387858e-05, + "loss": 0.09250588417053222, + "step": 3700 + }, + { + "epoch": 0.13586470735457112, + "grad_norm": 1.83432137966156, + "learning_rate": 4.660427616289463e-05, + "loss": 0.08511058807373047, + "step": 3800 + }, + { + "epoch": 0.13944009439021773, + "grad_norm": 1.1962814331054688, + "learning_rate": 4.651489148700347e-05, + "loss": 0.07956169128417968, + "step": 3900 + }, + { + "epoch": 0.14301548142586434, + "grad_norm": 1.145377278327942, + "learning_rate": 4.642550681111231e-05, + "loss": 0.08179279327392579, + "step": 4000 + }, + { + "epoch": 0.14659086846151095, + "grad_norm": 4.283623218536377, + "learning_rate": 4.633612213522114e-05, + "loss": 0.09118062019348144, + "step": 4100 + }, + { + "epoch": 0.15016625549715756, + "grad_norm": 2.0267841815948486, + "learning_rate": 4.6246737459329975e-05, + "loss": 0.0859706974029541, + "step": 4200 + }, + { + "epoch": 0.15374164253280417, + "grad_norm": 1.3412806987762451, + "learning_rate": 4.615735278343881e-05, + "loss": 0.07687939643859863, + "step": 4300 + }, + { + "epoch": 0.15731702956845078, + "grad_norm": 1.2748081684112549, + "learning_rate": 4.6067968107547644e-05, + "loss": 0.0789797306060791, + 
"step": 4400 + }, + { + "epoch": 0.1608924166040974, + "grad_norm": 0.8491079807281494, + "learning_rate": 4.597858343165648e-05, + "loss": 0.07809987068176269, + "step": 4500 + }, + { + "epoch": 0.164467803639744, + "grad_norm": 1.1583634614944458, + "learning_rate": 4.588919875576531e-05, + "loss": 0.07350683212280273, + "step": 4600 + }, + { + "epoch": 0.1680431906753906, + "grad_norm": 0.6579107642173767, + "learning_rate": 4.579981407987415e-05, + "loss": 0.07875243663787841, + "step": 4700 + }, + { + "epoch": 0.17161857771103722, + "grad_norm": 0.9742094278335571, + "learning_rate": 4.571042940398299e-05, + "loss": 0.08122955322265625, + "step": 4800 + }, + { + "epoch": 0.17519396474668383, + "grad_norm": 0.7365472912788391, + "learning_rate": 4.562104472809182e-05, + "loss": 0.07848617553710938, + "step": 4900 + }, + { + "epoch": 0.17876935178233044, + "grad_norm": 4.312972545623779, + "learning_rate": 4.5531660052200655e-05, + "loss": 0.07981382846832276, + "step": 5000 + }, + { + "epoch": 0.17876935178233044, + "eval_accuracy": 0.9774074625381929, + "eval_f1": 0.8253539377731214, + "eval_loss": 0.09500592201948166, + "eval_precision": 0.7928386037396048, + "eval_recall": 0.8606503176839261, + "eval_runtime": 27.5737, + "eval_samples_per_second": 815.994, + "eval_steps_per_second": 22.666, + "step": 5000 + }, + { + "epoch": 0.18234473881797705, + "grad_norm": 0.7737888097763062, + "learning_rate": 4.544227537630949e-05, + "loss": 0.07826550960540772, + "step": 5100 + }, + { + "epoch": 0.18592012585362366, + "grad_norm": 1.3171430826187134, + "learning_rate": 4.5352890700418324e-05, + "loss": 0.07463918685913086, + "step": 5200 + }, + { + "epoch": 0.18949551288927027, + "grad_norm": 1.445436716079712, + "learning_rate": 4.526350602452716e-05, + "loss": 0.07105834484100342, + "step": 5300 + }, + { + "epoch": 0.19307089992491688, + "grad_norm": 1.4572588205337524, + "learning_rate": 4.517412134863599e-05, + "loss": 0.07838897705078125, + "step": 5400 + }, + { 
+ "epoch": 0.1966462869605635, + "grad_norm": 0.940371572971344, + "learning_rate": 4.508473667274482e-05, + "loss": 0.07499915599822998, + "step": 5500 + }, + { + "epoch": 0.2002216739962101, + "grad_norm": 0.6899816393852234, + "learning_rate": 4.499535199685366e-05, + "loss": 0.07030135154724121, + "step": 5600 + }, + { + "epoch": 0.20379706103185669, + "grad_norm": 1.0485793352127075, + "learning_rate": 4.490596732096249e-05, + "loss": 0.07988662719726562, + "step": 5700 + }, + { + "epoch": 0.2073724480675033, + "grad_norm": 1.5643068552017212, + "learning_rate": 4.481658264507133e-05, + "loss": 0.06960040092468261, + "step": 5800 + }, + { + "epoch": 0.2109478351031499, + "grad_norm": 0.5016098022460938, + "learning_rate": 4.4727197969180166e-05, + "loss": 0.07297846794128418, + "step": 5900 + }, + { + "epoch": 0.21452322213879652, + "grad_norm": 1.0210011005401611, + "learning_rate": 4.4637813293289e-05, + "loss": 0.07059600353240966, + "step": 6000 + }, + { + "epoch": 0.21809860917444313, + "grad_norm": 1.0845718383789062, + "learning_rate": 4.4548428617397835e-05, + "loss": 0.06686034202575683, + "step": 6100 + }, + { + "epoch": 0.22167399621008974, + "grad_norm": 0.8528701663017273, + "learning_rate": 4.445904394150667e-05, + "loss": 0.06841277122497559, + "step": 6200 + }, + { + "epoch": 0.22524938324573635, + "grad_norm": 1.8847771883010864, + "learning_rate": 4.43696592656155e-05, + "loss": 0.0757840919494629, + "step": 6300 + }, + { + "epoch": 0.22882477028138296, + "grad_norm": 2.079796075820923, + "learning_rate": 4.428027458972434e-05, + "loss": 0.06774754524230957, + "step": 6400 + }, + { + "epoch": 0.23240015731702957, + "grad_norm": 1.0023269653320312, + "learning_rate": 4.419088991383317e-05, + "loss": 0.07408513069152832, + "step": 6500 + }, + { + "epoch": 0.23597554435267618, + "grad_norm": 1.2481714487075806, + "learning_rate": 4.410150523794201e-05, + "loss": 0.07167030811309814, + "step": 6600 + }, + { + "epoch": 0.23955093138832279, + 
"grad_norm": 1.2812612056732178, + "learning_rate": 4.4012120562050846e-05, + "loss": 0.07096508502960205, + "step": 6700 + }, + { + "epoch": 0.2431263184239694, + "grad_norm": 1.1778594255447388, + "learning_rate": 4.392273588615968e-05, + "loss": 0.06785487651824951, + "step": 6800 + }, + { + "epoch": 0.246701705459616, + "grad_norm": 1.0923346281051636, + "learning_rate": 4.3833351210268515e-05, + "loss": 0.07081903457641602, + "step": 6900 + }, + { + "epoch": 0.2502770924952626, + "grad_norm": 0.9252088069915771, + "learning_rate": 4.374396653437735e-05, + "loss": 0.0647373390197754, + "step": 7000 + }, + { + "epoch": 0.2538524795309092, + "grad_norm": 2.192573070526123, + "learning_rate": 4.365458185848618e-05, + "loss": 0.0676526165008545, + "step": 7100 + }, + { + "epoch": 0.2574278665665558, + "grad_norm": 1.6381704807281494, + "learning_rate": 4.356519718259502e-05, + "loss": 0.07402976512908936, + "step": 7200 + }, + { + "epoch": 0.26100325360220245, + "grad_norm": 0.9283214807510376, + "learning_rate": 4.347581250670385e-05, + "loss": 0.06920474052429199, + "step": 7300 + }, + { + "epoch": 0.264578640637849, + "grad_norm": 0.8774147033691406, + "learning_rate": 4.338642783081269e-05, + "loss": 0.07280929565429688, + "step": 7400 + }, + { + "epoch": 0.26815402767349567, + "grad_norm": 1.8515883684158325, + "learning_rate": 4.3297043154921526e-05, + "loss": 0.07380086898803712, + "step": 7500 + }, + { + "epoch": 0.26815402767349567, + "eval_accuracy": 0.9785046625741701, + "eval_f1": 0.8438232328500399, + "eval_loss": 0.08568981289863586, + "eval_precision": 0.8172658575681245, + "eval_recall": 0.8721645631486645, + "eval_runtime": 27.0275, + "eval_samples_per_second": 832.487, + "eval_steps_per_second": 23.125, + "step": 7500 + }, + { + "epoch": 0.27172941470914225, + "grad_norm": 0.7200068235397339, + "learning_rate": 4.320765847903036e-05, + "loss": 0.0730604887008667, + "step": 7600 + }, + { + "epoch": 0.2753048017447889, + "grad_norm": 
0.915267288684845, + "learning_rate": 4.3118273803139195e-05, + "loss": 0.07050428867340088, + "step": 7700 + }, + { + "epoch": 0.27888018878043547, + "grad_norm": 0.7131528854370117, + "learning_rate": 4.3028889127248025e-05, + "loss": 0.0710810136795044, + "step": 7800 + }, + { + "epoch": 0.2824555758160821, + "grad_norm": 1.1002038717269897, + "learning_rate": 4.293950445135686e-05, + "loss": 0.07342493057250976, + "step": 7900 + }, + { + "epoch": 0.2860309628517287, + "grad_norm": 0.9407269954681396, + "learning_rate": 4.2850119775465694e-05, + "loss": 0.0673301601409912, + "step": 8000 + }, + { + "epoch": 0.2896063498873753, + "grad_norm": 2.832193613052368, + "learning_rate": 4.276073509957453e-05, + "loss": 0.06240209102630615, + "step": 8100 + }, + { + "epoch": 0.2931817369230219, + "grad_norm": 0.8768466114997864, + "learning_rate": 4.267135042368336e-05, + "loss": 0.06878421783447265, + "step": 8200 + }, + { + "epoch": 0.29675712395866854, + "grad_norm": 2.6219418048858643, + "learning_rate": 4.25819657477922e-05, + "loss": 0.06775379657745362, + "step": 8300 + }, + { + "epoch": 0.3003325109943151, + "grad_norm": 1.4696264266967773, + "learning_rate": 4.249258107190103e-05, + "loss": 0.06918183803558349, + "step": 8400 + }, + { + "epoch": 0.30390789802996176, + "grad_norm": 0.3726998269557953, + "learning_rate": 4.240319639600987e-05, + "loss": 0.0662617588043213, + "step": 8500 + }, + { + "epoch": 0.30748328506560835, + "grad_norm": 0.7445316314697266, + "learning_rate": 4.2313811720118706e-05, + "loss": 0.06442654609680176, + "step": 8600 + }, + { + "epoch": 0.311058672101255, + "grad_norm": 1.971909761428833, + "learning_rate": 4.2224427044227536e-05, + "loss": 0.0726364278793335, + "step": 8700 + }, + { + "epoch": 0.31463405913690157, + "grad_norm": 1.5563815832138062, + "learning_rate": 4.2135042368336374e-05, + "loss": 0.06712177753448487, + "step": 8800 + }, + { + "epoch": 0.3182094461725482, + "grad_norm": 0.7900974154472351, + "learning_rate": 
4.204565769244521e-05, + "loss": 0.058818936347961426, + "step": 8900 + }, + { + "epoch": 0.3217848332081948, + "grad_norm": 1.3865457773208618, + "learning_rate": 4.195627301655404e-05, + "loss": 0.06370719909667968, + "step": 9000 + }, + { + "epoch": 0.32536022024384137, + "grad_norm": 0.34235015511512756, + "learning_rate": 4.186688834066288e-05, + "loss": 0.06904962062835693, + "step": 9100 + }, + { + "epoch": 0.328935607279488, + "grad_norm": 2.1909384727478027, + "learning_rate": 4.177750366477171e-05, + "loss": 0.06057620048522949, + "step": 9200 + }, + { + "epoch": 0.3325109943151346, + "grad_norm": 1.308127760887146, + "learning_rate": 4.168811898888055e-05, + "loss": 0.06866058826446533, + "step": 9300 + }, + { + "epoch": 0.3360863813507812, + "grad_norm": 0.6863975524902344, + "learning_rate": 4.1598734312989386e-05, + "loss": 0.06358649730682372, + "step": 9400 + }, + { + "epoch": 0.3396617683864278, + "grad_norm": 1.1869947910308838, + "learning_rate": 4.1509349637098216e-05, + "loss": 0.06475292205810547, + "step": 9500 + }, + { + "epoch": 0.34323715542207445, + "grad_norm": 1.4386121034622192, + "learning_rate": 4.1419964961207054e-05, + "loss": 0.06661314010620117, + "step": 9600 + }, + { + "epoch": 0.34681254245772103, + "grad_norm": 0.48181113600730896, + "learning_rate": 4.133058028531589e-05, + "loss": 0.060897083282470704, + "step": 9700 + }, + { + "epoch": 0.35038792949336767, + "grad_norm": 0.8885261416435242, + "learning_rate": 4.124119560942472e-05, + "loss": 0.06239647388458252, + "step": 9800 + }, + { + "epoch": 0.35396331652901425, + "grad_norm": 1.2147257328033447, + "learning_rate": 4.115181093353356e-05, + "loss": 0.06007009029388428, + "step": 9900 + }, + { + "epoch": 0.3575387035646609, + "grad_norm": 3.1831276416778564, + "learning_rate": 4.106242625764239e-05, + "loss": 0.06108261108398438, + "step": 10000 + }, + { + "epoch": 0.3575387035646609, + "eval_accuracy": 0.9812439807847978, + "eval_f1": 0.8498944390638173, + "eval_loss": 
0.07966496795415878, + "eval_precision": 0.8246859491839411, + "eval_recall": 0.8766926372078314, + "eval_runtime": 27.6584, + "eval_samples_per_second": 813.495, + "eval_steps_per_second": 22.597, + "step": 10000 + }, + { + "epoch": 0.36111409060030747, + "grad_norm": 1.2144405841827393, + "learning_rate": 4.097304158175123e-05, + "loss": 0.06152146816253662, + "step": 10100 + }, + { + "epoch": 0.3646894776359541, + "grad_norm": 0.777988076210022, + "learning_rate": 4.0883656905860066e-05, + "loss": 0.06342405319213867, + "step": 10200 + }, + { + "epoch": 0.3682648646716007, + "grad_norm": 0.6419842839241028, + "learning_rate": 4.0794272229968896e-05, + "loss": 0.055976643562316894, + "step": 10300 + }, + { + "epoch": 0.3718402517072473, + "grad_norm": 0.45166343450546265, + "learning_rate": 4.0704887554077734e-05, + "loss": 0.07191664695739747, + "step": 10400 + }, + { + "epoch": 0.3754156387428939, + "grad_norm": 0.5005468726158142, + "learning_rate": 4.0615502878186565e-05, + "loss": 0.06205938339233399, + "step": 10500 + }, + { + "epoch": 0.37899102577854055, + "grad_norm": 0.6201260089874268, + "learning_rate": 4.0526118202295396e-05, + "loss": 0.061759543418884275, + "step": 10600 + }, + { + "epoch": 0.38256641281418713, + "grad_norm": 0.4341242015361786, + "learning_rate": 4.043673352640423e-05, + "loss": 0.06618201732635498, + "step": 10700 + }, + { + "epoch": 0.38614179984983377, + "grad_norm": 0.4003482460975647, + "learning_rate": 4.034734885051307e-05, + "loss": 0.06178065299987793, + "step": 10800 + }, + { + "epoch": 0.38971718688548035, + "grad_norm": 1.0296162366867065, + "learning_rate": 4.02579641746219e-05, + "loss": 0.06249929904937744, + "step": 10900 + }, + { + "epoch": 0.393292573921127, + "grad_norm": 1.362121820449829, + "learning_rate": 4.016857949873074e-05, + "loss": 0.05500233173370361, + "step": 11000 + }, + { + "epoch": 0.39686796095677357, + "grad_norm": 0.7699733376502991, + "learning_rate": 4.007919482283957e-05, + "loss": 
0.060595006942749025, + "step": 11100 + }, + { + "epoch": 0.4004433479924202, + "grad_norm": 1.3927844762802124, + "learning_rate": 3.998981014694841e-05, + "loss": 0.05860278129577637, + "step": 11200 + }, + { + "epoch": 0.4040187350280668, + "grad_norm": 0.5842928290367126, + "learning_rate": 3.9900425471057245e-05, + "loss": 0.062330193519592285, + "step": 11300 + }, + { + "epoch": 0.40759412206371337, + "grad_norm": 1.231602430343628, + "learning_rate": 3.9811040795166076e-05, + "loss": 0.05743512153625488, + "step": 11400 + }, + { + "epoch": 0.41116950909936, + "grad_norm": 0.33235710859298706, + "learning_rate": 3.972165611927491e-05, + "loss": 0.059948296546936036, + "step": 11500 + }, + { + "epoch": 0.4147448961350066, + "grad_norm": 0.812560498714447, + "learning_rate": 3.963227144338375e-05, + "loss": 0.06013148784637451, + "step": 11600 + }, + { + "epoch": 0.41832028317065323, + "grad_norm": 0.7160065174102783, + "learning_rate": 3.954288676749258e-05, + "loss": 0.0654984951019287, + "step": 11700 + }, + { + "epoch": 0.4218956702062998, + "grad_norm": 0.959859311580658, + "learning_rate": 3.945350209160142e-05, + "loss": 0.061361746788024904, + "step": 11800 + }, + { + "epoch": 0.42547105724194645, + "grad_norm": 0.661882758140564, + "learning_rate": 3.936411741571025e-05, + "loss": 0.05800935268402099, + "step": 11900 + }, + { + "epoch": 0.42904644427759303, + "grad_norm": 1.3494808673858643, + "learning_rate": 3.927473273981909e-05, + "loss": 0.058743157386779786, + "step": 12000 + }, + { + "epoch": 0.43262183131323967, + "grad_norm": 0.3964793384075165, + "learning_rate": 3.9185348063927925e-05, + "loss": 0.05860978603363037, + "step": 12100 + }, + { + "epoch": 0.43619721834888625, + "grad_norm": 0.6984548568725586, + "learning_rate": 3.9095963388036756e-05, + "loss": 0.05355045795440674, + "step": 12200 + }, + { + "epoch": 0.4397726053845329, + "grad_norm": 0.9193189144134521, + "learning_rate": 3.900657871214559e-05, + "loss": 0.05985400676727295, + 
"step": 12300 + }, + { + "epoch": 0.44334799242017947, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.891719403625443e-05, + "loss": 0.06027592182159424, + "step": 12400 + }, + { + "epoch": 0.4469233794558261, + "grad_norm": 2.280050754547119, + "learning_rate": 3.882780936036326e-05, + "loss": 0.05881267070770264, + "step": 12500 + }, + { + "epoch": 0.4469233794558261, + "eval_accuracy": 0.9822486078551317, + "eval_f1": 0.8582031250000001, + "eval_loss": 0.07316970080137253, + "eval_precision": 0.8336179093151205, + "eval_recall": 0.884282551821292, + "eval_runtime": 27.7811, + "eval_samples_per_second": 809.904, + "eval_steps_per_second": 22.497, + "step": 12500 + }, + { + "epoch": 0.4504987664914727, + "grad_norm": 0.9513980746269226, + "learning_rate": 3.87384246844721e-05, + "loss": 0.05991718769073486, + "step": 12600 + }, + { + "epoch": 0.45407415352711933, + "grad_norm": 0.8513447046279907, + "learning_rate": 3.8649040008580937e-05, + "loss": 0.059436683654785154, + "step": 12700 + }, + { + "epoch": 0.4576495405627659, + "grad_norm": 3.6959080696105957, + "learning_rate": 3.855965533268977e-05, + "loss": 0.06146327018737793, + "step": 12800 + }, + { + "epoch": 0.46122492759841255, + "grad_norm": 0.4215289056301117, + "learning_rate": 3.8470270656798605e-05, + "loss": 0.051028499603271486, + "step": 12900 + }, + { + "epoch": 0.46480031463405913, + "grad_norm": 0.553249716758728, + "learning_rate": 3.8380885980907436e-05, + "loss": 0.05888910293579101, + "step": 13000 + }, + { + "epoch": 0.46837570166970577, + "grad_norm": 0.534638524055481, + "learning_rate": 3.8291501305016266e-05, + "loss": 0.056477956771850586, + "step": 13100 + }, + { + "epoch": 0.47195108870535235, + "grad_norm": 0.5859609842300415, + "learning_rate": 3.8202116629125104e-05, + "loss": 0.05791654109954834, + "step": 13200 + }, + { + "epoch": 0.475526475740999, + "grad_norm": 0.6610586047172546, + "learning_rate": 3.8112731953233935e-05, + "loss": 0.05362565040588379, + "step": 
13300 + }, + { + "epoch": 0.47910186277664557, + "grad_norm": 0.6048291325569153, + "learning_rate": 3.802334727734277e-05, + "loss": 0.057788271903991696, + "step": 13400 + }, + { + "epoch": 0.4826772498122922, + "grad_norm": 0.7319697141647339, + "learning_rate": 3.793396260145161e-05, + "loss": 0.05477115631103516, + "step": 13500 + }, + { + "epoch": 0.4862526368479388, + "grad_norm": 0.5771811008453369, + "learning_rate": 3.784457792556044e-05, + "loss": 0.059410476684570314, + "step": 13600 + }, + { + "epoch": 0.4898280238835854, + "grad_norm": 1.9499260187149048, + "learning_rate": 3.775519324966928e-05, + "loss": 0.052494893074035646, + "step": 13700 + }, + { + "epoch": 0.493403410919232, + "grad_norm": 0.8795179128646851, + "learning_rate": 3.766580857377811e-05, + "loss": 0.05528387546539307, + "step": 13800 + }, + { + "epoch": 0.4969787979548786, + "grad_norm": 0.5892202258110046, + "learning_rate": 3.7576423897886947e-05, + "loss": 0.05600544452667236, + "step": 13900 + }, + { + "epoch": 0.5005541849905252, + "grad_norm": 0.6402941346168518, + "learning_rate": 3.7487039221995784e-05, + "loss": 0.05357628345489502, + "step": 14000 + }, + { + "epoch": 0.5041295720261718, + "grad_norm": 0.5255988836288452, + "learning_rate": 3.7397654546104615e-05, + "loss": 0.057103352546691896, + "step": 14100 + }, + { + "epoch": 0.5077049590618185, + "grad_norm": 0.8301808834075928, + "learning_rate": 3.730826987021345e-05, + "loss": 0.0532010555267334, + "step": 14200 + }, + { + "epoch": 0.5112803460974651, + "grad_norm": 0.6901052594184875, + "learning_rate": 3.721888519432229e-05, + "loss": 0.05516294002532959, + "step": 14300 + }, + { + "epoch": 0.5148557331331116, + "grad_norm": 0.9628658890724182, + "learning_rate": 3.712950051843112e-05, + "loss": 0.06214995384216308, + "step": 14400 + }, + { + "epoch": 0.5184311201687583, + "grad_norm": 1.3679792881011963, + "learning_rate": 3.704011584253996e-05, + "loss": 0.05541347503662109, + "step": 14500 + }, + { + "epoch": 
0.5220065072044049, + "grad_norm": 0.23267334699630737, + "learning_rate": 3.695073116664879e-05, + "loss": 0.0589248275756836, + "step": 14600 + }, + { + "epoch": 0.5255818942400515, + "grad_norm": 0.6239579319953918, + "learning_rate": 3.6861346490757627e-05, + "loss": 0.053284521102905276, + "step": 14700 + }, + { + "epoch": 0.529157281275698, + "grad_norm": 0.7674051523208618, + "learning_rate": 3.6771961814866464e-05, + "loss": 0.05738714218139648, + "step": 14800 + }, + { + "epoch": 0.5327326683113447, + "grad_norm": 0.8594136834144592, + "learning_rate": 3.6682577138975295e-05, + "loss": 0.055074062347412106, + "step": 14900 + }, + { + "epoch": 0.5363080553469913, + "grad_norm": 1.3505005836486816, + "learning_rate": 3.659319246308413e-05, + "loss": 0.05417671680450439, + "step": 15000 + }, + { + "epoch": 0.5363080553469913, + "eval_accuracy": 0.9838496993745539, + "eval_f1": 0.8737497800457504, + "eval_loss": 0.06651480495929718, + "eval_precision": 0.8560178736432719, + "eval_recall": 0.8922318373918293, + "eval_runtime": 27.3392, + "eval_samples_per_second": 822.994, + "eval_steps_per_second": 22.861, + "step": 15000 + }, + { + "epoch": 0.539883442382638, + "grad_norm": 0.7868797779083252, + "learning_rate": 3.650380778719297e-05, + "loss": 0.060230064392089847, + "step": 15100 + }, + { + "epoch": 0.5434588294182845, + "grad_norm": 0.3154486119747162, + "learning_rate": 3.64144231113018e-05, + "loss": 0.05918198108673096, + "step": 15200 + }, + { + "epoch": 0.5470342164539311, + "grad_norm": 0.5093942284584045, + "learning_rate": 3.632503843541064e-05, + "loss": 0.05554147720336914, + "step": 15300 + }, + { + "epoch": 0.5506096034895778, + "grad_norm": 1.080651044845581, + "learning_rate": 3.623565375951947e-05, + "loss": 0.05167547702789307, + "step": 15400 + }, + { + "epoch": 0.5541849905252244, + "grad_norm": 1.2834564447402954, + "learning_rate": 3.614626908362831e-05, + "loss": 0.05269266128540039, + "step": 15500 + }, + { + "epoch": 
0.5577603775608709, + "grad_norm": 0.9456666707992554, + "learning_rate": 3.605688440773714e-05, + "loss": 0.05228121280670166, + "step": 15600 + }, + { + "epoch": 0.5613357645965176, + "grad_norm": 1.931270718574524, + "learning_rate": 3.5967499731845975e-05, + "loss": 0.05532039642333984, + "step": 15700 + }, + { + "epoch": 0.5649111516321642, + "grad_norm": 1.9416167736053467, + "learning_rate": 3.5878115055954806e-05, + "loss": 0.05132888793945312, + "step": 15800 + }, + { + "epoch": 0.5684865386678107, + "grad_norm": 0.2992418110370636, + "learning_rate": 3.578873038006364e-05, + "loss": 0.05806799411773682, + "step": 15900 + }, + { + "epoch": 0.5720619257034574, + "grad_norm": 0.7173650860786438, + "learning_rate": 3.5699345704172474e-05, + "loss": 0.05833985805511475, + "step": 16000 + }, + { + "epoch": 0.575637312739104, + "grad_norm": 1.0283321142196655, + "learning_rate": 3.560996102828131e-05, + "loss": 0.05651096820831299, + "step": 16100 + }, + { + "epoch": 0.5792126997747507, + "grad_norm": 0.43172529339790344, + "learning_rate": 3.552057635239015e-05, + "loss": 0.05330658435821533, + "step": 16200 + }, + { + "epoch": 0.5827880868103972, + "grad_norm": 0.6333898901939392, + "learning_rate": 3.543119167649898e-05, + "loss": 0.053462224006652834, + "step": 16300 + }, + { + "epoch": 0.5863634738460438, + "grad_norm": 0.8817270994186401, + "learning_rate": 3.534180700060782e-05, + "loss": 0.05549070358276367, + "step": 16400 + }, + { + "epoch": 0.5899388608816905, + "grad_norm": 4.280094146728516, + "learning_rate": 3.525242232471665e-05, + "loss": 0.05985762119293213, + "step": 16500 + }, + { + "epoch": 0.5935142479173371, + "grad_norm": 0.62297523021698, + "learning_rate": 3.5163037648825486e-05, + "loss": 0.05666534423828125, + "step": 16600 + }, + { + "epoch": 0.5970896349529836, + "grad_norm": 0.29738688468933105, + "learning_rate": 3.507365297293432e-05, + "loss": 0.053336749076843264, + "step": 16700 + }, + { + "epoch": 0.6006650219886303, + 
"grad_norm": 1.139436960220337, + "learning_rate": 3.4984268297043154e-05, + "loss": 0.05532379150390625, + "step": 16800 + }, + { + "epoch": 0.6042404090242769, + "grad_norm": 0.37320244312286377, + "learning_rate": 3.489488362115199e-05, + "loss": 0.05435383796691894, + "step": 16900 + }, + { + "epoch": 0.6078157960599235, + "grad_norm": 0.5908817648887634, + "learning_rate": 3.480549894526083e-05, + "loss": 0.052842388153076174, + "step": 17000 + }, + { + "epoch": 0.6113911830955701, + "grad_norm": 0.4973529279232025, + "learning_rate": 3.471611426936966e-05, + "loss": 0.05500569343566895, + "step": 17100 + }, + { + "epoch": 0.6149665701312167, + "grad_norm": 1.438362717628479, + "learning_rate": 3.46267295934785e-05, + "loss": 0.04875383853912354, + "step": 17200 + }, + { + "epoch": 0.6185419571668633, + "grad_norm": 1.1460702419281006, + "learning_rate": 3.4537344917587335e-05, + "loss": 0.05489758968353271, + "step": 17300 + }, + { + "epoch": 0.62211734420251, + "grad_norm": 0.359030157327652, + "learning_rate": 3.4447960241696166e-05, + "loss": 0.0537039852142334, + "step": 17400 + }, + { + "epoch": 0.6256927312381565, + "grad_norm": 1.0160428285598755, + "learning_rate": 3.4358575565805e-05, + "loss": 0.05569758415222168, + "step": 17500 + }, + { + "epoch": 0.6256927312381565, + "eval_accuracy": 0.9845403147375635, + "eval_f1": 0.877477096546864, + "eval_loss": 0.0613168403506279, + "eval_precision": 0.8606879199270053, + "eval_recall": 0.8949343069890464, + "eval_runtime": 27.832, + "eval_samples_per_second": 808.423, + "eval_steps_per_second": 22.456, + "step": 17500 + }, + { + "epoch": 0.6292681182738031, + "grad_norm": 0.9637561440467834, + "learning_rate": 3.4269190889913834e-05, + "loss": 0.05049953460693359, + "step": 17600 + }, + { + "epoch": 0.6328435053094498, + "grad_norm": 0.4047839343547821, + "learning_rate": 3.417980621402267e-05, + "loss": 0.051105165481567384, + "step": 17700 + }, + { + "epoch": 0.6364188923450964, + "grad_norm": 
0.5562448501586914, + "learning_rate": 3.409042153813151e-05, + "loss": 0.04887496471405029, + "step": 17800 + }, + { + "epoch": 0.6399942793807429, + "grad_norm": 0.7675971984863281, + "learning_rate": 3.400103686224034e-05, + "loss": 0.05429211139678955, + "step": 17900 + }, + { + "epoch": 0.6435696664163896, + "grad_norm": 0.44871142506599426, + "learning_rate": 3.391165218634918e-05, + "loss": 0.04755040645599365, + "step": 18000 + }, + { + "epoch": 0.6471450534520362, + "grad_norm": 0.4453502297401428, + "learning_rate": 3.382226751045801e-05, + "loss": 0.05987214088439941, + "step": 18100 + }, + { + "epoch": 0.6507204404876827, + "grad_norm": 0.4004403352737427, + "learning_rate": 3.373288283456684e-05, + "loss": 0.054094972610473635, + "step": 18200 + }, + { + "epoch": 0.6542958275233294, + "grad_norm": 0.8362923264503479, + "learning_rate": 3.364349815867568e-05, + "loss": 0.04843898296356201, + "step": 18300 + }, + { + "epoch": 0.657871214558976, + "grad_norm": 0.6269751787185669, + "learning_rate": 3.355411348278451e-05, + "loss": 0.05007925033569336, + "step": 18400 + }, + { + "epoch": 0.6614466015946227, + "grad_norm": 0.7181591987609863, + "learning_rate": 3.3464728806893345e-05, + "loss": 0.05742511749267578, + "step": 18500 + }, + { + "epoch": 0.6650219886302692, + "grad_norm": 2.8255951404571533, + "learning_rate": 3.337534413100218e-05, + "loss": 0.050363807678222655, + "step": 18600 + }, + { + "epoch": 0.6685973756659158, + "grad_norm": 1.1854428052902222, + "learning_rate": 3.328595945511101e-05, + "loss": 0.05580689430236816, + "step": 18700 + }, + { + "epoch": 0.6721727627015625, + "grad_norm": 0.3564029335975647, + "learning_rate": 3.319657477921985e-05, + "loss": 0.04986191749572754, + "step": 18800 + }, + { + "epoch": 0.6757481497372091, + "grad_norm": 0.9392517805099487, + "learning_rate": 3.310719010332869e-05, + "loss": 0.05029686450958252, + "step": 18900 + }, + { + "epoch": 0.6793235367728556, + "grad_norm": 0.9811071157455444, + 
"learning_rate": 3.301780542743752e-05, + "loss": 0.05468404293060303, + "step": 19000 + }, + { + "epoch": 0.6828989238085023, + "grad_norm": 2.1979386806488037, + "learning_rate": 3.292842075154636e-05, + "loss": 0.04795463562011719, + "step": 19100 + }, + { + "epoch": 0.6864743108441489, + "grad_norm": 4.135185241699219, + "learning_rate": 3.2839036075655194e-05, + "loss": 0.051746668815612795, + "step": 19200 + }, + { + "epoch": 0.6900496978797955, + "grad_norm": 0.611629843711853, + "learning_rate": 3.2749651399764025e-05, + "loss": 0.05136622428894043, + "step": 19300 + }, + { + "epoch": 0.6936250849154421, + "grad_norm": 0.7905089259147644, + "learning_rate": 3.266026672387286e-05, + "loss": 0.0534757661819458, + "step": 19400 + }, + { + "epoch": 0.6972004719510887, + "grad_norm": 0.3704472482204437, + "learning_rate": 3.257088204798169e-05, + "loss": 0.05190816879272461, + "step": 19500 + }, + { + "epoch": 0.7007758589867353, + "grad_norm": 0.41257503628730774, + "learning_rate": 3.248149737209053e-05, + "loss": 0.05314404487609863, + "step": 19600 + }, + { + "epoch": 0.704351246022382, + "grad_norm": 1.0130038261413574, + "learning_rate": 3.239211269619937e-05, + "loss": 0.051221070289611814, + "step": 19700 + }, + { + "epoch": 0.7079266330580285, + "grad_norm": 0.44306495785713196, + "learning_rate": 3.23027280203082e-05, + "loss": 0.05151443004608154, + "step": 19800 + }, + { + "epoch": 0.7115020200936751, + "grad_norm": 1.3375622034072876, + "learning_rate": 3.221334334441704e-05, + "loss": 0.051753206253051756, + "step": 19900 + }, + { + "epoch": 0.7150774071293218, + "grad_norm": 0.48512154817581177, + "learning_rate": 3.2123958668525874e-05, + "loss": 0.04863485813140869, + "step": 20000 + }, + { + "epoch": 0.7150774071293218, + "eval_accuracy": 0.9851494027728759, + "eval_f1": 0.8755350929603205, + "eval_loss": 0.05904531106352806, + "eval_precision": 0.856669280182671, + "eval_recall": 0.8952505534312739, + "eval_runtime": 27.4851, + 
"eval_samples_per_second": 818.624, + "eval_steps_per_second": 22.74, + "step": 20000 + }, + { + "epoch": 0.7186527941649684, + "grad_norm": 0.45322614908218384, + "learning_rate": 3.2034573992634705e-05, + "loss": 0.05499778270721436, + "step": 20100 + }, + { + "epoch": 0.7222281812006149, + "grad_norm": 0.4665698707103729, + "learning_rate": 3.194518931674354e-05, + "loss": 0.05100120544433594, + "step": 20200 + }, + { + "epoch": 0.7258035682362616, + "grad_norm": 0.7074053883552551, + "learning_rate": 3.185580464085237e-05, + "loss": 0.04919565200805664, + "step": 20300 + }, + { + "epoch": 0.7293789552719082, + "grad_norm": 1.2581121921539307, + "learning_rate": 3.176641996496121e-05, + "loss": 0.05387771606445312, + "step": 20400 + }, + { + "epoch": 0.7329543423075547, + "grad_norm": 0.3161942660808563, + "learning_rate": 3.167703528907004e-05, + "loss": 0.04680909633636474, + "step": 20500 + }, + { + "epoch": 0.7365297293432014, + "grad_norm": 0.8641468286514282, + "learning_rate": 3.158765061317888e-05, + "loss": 0.04961400508880615, + "step": 20600 + }, + { + "epoch": 0.740105116378848, + "grad_norm": 0.6563690304756165, + "learning_rate": 3.149826593728771e-05, + "loss": 0.05145148754119873, + "step": 20700 + }, + { + "epoch": 0.7436805034144947, + "grad_norm": 0.3394390940666199, + "learning_rate": 3.140888126139655e-05, + "loss": 0.048502054214477536, + "step": 20800 + }, + { + "epoch": 0.7472558904501412, + "grad_norm": 0.5382287502288818, + "learning_rate": 3.131949658550538e-05, + "loss": 0.052634720802307126, + "step": 20900 + }, + { + "epoch": 0.7508312774857878, + "grad_norm": 0.5506078004837036, + "learning_rate": 3.1230111909614216e-05, + "loss": 0.05615939140319824, + "step": 21000 + }, + { + "epoch": 0.7544066645214345, + "grad_norm": 0.4533487558364868, + "learning_rate": 3.114072723372305e-05, + "loss": 0.0571517276763916, + "step": 21100 + }, + { + "epoch": 0.7579820515570811, + "grad_norm": 1.2659982442855835, + "learning_rate": 
3.1051342557831884e-05, + "loss": 0.05127411842346191, + "step": 21200 + }, + { + "epoch": 0.7615574385927276, + "grad_norm": 0.38378211855888367, + "learning_rate": 3.096195788194072e-05, + "loss": 0.04847681522369385, + "step": 21300 + }, + { + "epoch": 0.7651328256283743, + "grad_norm": 0.2992658317089081, + "learning_rate": 3.087257320604955e-05, + "loss": 0.05205928325653076, + "step": 21400 + }, + { + "epoch": 0.7687082126640209, + "grad_norm": 0.5818284749984741, + "learning_rate": 3.078318853015839e-05, + "loss": 0.04922466278076172, + "step": 21500 + }, + { + "epoch": 0.7722835996996675, + "grad_norm": 0.41028082370758057, + "learning_rate": 3.069380385426723e-05, + "loss": 0.04695847034454346, + "step": 21600 + }, + { + "epoch": 0.7758589867353141, + "grad_norm": 0.31596678495407104, + "learning_rate": 3.060441917837606e-05, + "loss": 0.049401440620422364, + "step": 21700 + }, + { + "epoch": 0.7794343737709607, + "grad_norm": 0.39899763464927673, + "learning_rate": 3.0515034502484896e-05, + "loss": 0.0458904504776001, + "step": 21800 + }, + { + "epoch": 0.7830097608066073, + "grad_norm": 4.016449928283691, + "learning_rate": 3.0425649826593733e-05, + "loss": 0.04808720588684082, + "step": 21900 + }, + { + "epoch": 0.786585147842254, + "grad_norm": 1.8184044361114502, + "learning_rate": 3.0336265150702564e-05, + "loss": 0.050203371047973636, + "step": 22000 + }, + { + "epoch": 0.7901605348779005, + "grad_norm": 0.47340500354766846, + "learning_rate": 3.0246880474811402e-05, + "loss": 0.04804760932922363, + "step": 22100 + }, + { + "epoch": 0.7937359219135471, + "grad_norm": 1.306254506111145, + "learning_rate": 3.0157495798920233e-05, + "loss": 0.04765232563018799, + "step": 22200 + }, + { + "epoch": 0.7973113089491938, + "grad_norm": 0.6133173704147339, + "learning_rate": 3.006811112302907e-05, + "loss": 0.04909511566162109, + "step": 22300 + }, + { + "epoch": 0.8008866959848404, + "grad_norm": 1.063022494316101, + "learning_rate": 2.9978726447137904e-05, 
+ "loss": 0.048132557868957516, + "step": 22400 + }, + { + "epoch": 0.8044620830204869, + "grad_norm": 0.4442903697490692, + "learning_rate": 2.988934177124674e-05, + "loss": 0.04739914894104004, + "step": 22500 + }, + { + "epoch": 0.8044620830204869, + "eval_accuracy": 0.9853803092042249, + "eval_f1": 0.8812850838481906, + "eval_loss": 0.060121480375528336, + "eval_precision": 0.8660403280645027, + "eval_recall": 0.8970761578932237, + "eval_runtime": 27.7438, + "eval_samples_per_second": 810.991, + "eval_steps_per_second": 22.528, + "step": 22500 + }, + { + "epoch": 0.8080374700561336, + "grad_norm": 0.8813098073005676, + "learning_rate": 2.9799957095355573e-05, + "loss": 0.05161878108978271, + "step": 22600 + }, + { + "epoch": 0.8116128570917802, + "grad_norm": 0.7460477948188782, + "learning_rate": 2.971057241946441e-05, + "loss": 0.0515793514251709, + "step": 22700 + }, + { + "epoch": 0.8151882441274267, + "grad_norm": 0.5062021613121033, + "learning_rate": 2.962118774357324e-05, + "loss": 0.04754622936248779, + "step": 22800 + }, + { + "epoch": 0.8187636311630734, + "grad_norm": 0.7567230463027954, + "learning_rate": 2.953180306768208e-05, + "loss": 0.05149875164031983, + "step": 22900 + }, + { + "epoch": 0.82233901819872, + "grad_norm": 0.7439789772033691, + "learning_rate": 2.944241839179091e-05, + "loss": 0.04974982738494873, + "step": 23000 + }, + { + "epoch": 0.8259144052343667, + "grad_norm": 0.669979453086853, + "learning_rate": 2.9353033715899747e-05, + "loss": 0.04604334354400635, + "step": 23100 + }, + { + "epoch": 0.8294897922700132, + "grad_norm": 1.005071759223938, + "learning_rate": 2.9263649040008584e-05, + "loss": 0.04706980228424072, + "step": 23200 + }, + { + "epoch": 0.8330651793056598, + "grad_norm": 0.31772536039352417, + "learning_rate": 2.9174264364117415e-05, + "loss": 0.05056349754333496, + "step": 23300 + }, + { + "epoch": 0.8366405663413065, + "grad_norm": 0.32514145970344543, + "learning_rate": 2.9084879688226253e-05, + "loss": 
0.04744285106658935, + "step": 23400 + }, + { + "epoch": 0.8402159533769531, + "grad_norm": 1.0965938568115234, + "learning_rate": 2.899549501233509e-05, + "loss": 0.04618396759033203, + "step": 23500 + }, + { + "epoch": 0.8437913404125996, + "grad_norm": 0.6312568783760071, + "learning_rate": 2.890611033644392e-05, + "loss": 0.04719692230224609, + "step": 23600 + }, + { + "epoch": 0.8473667274482463, + "grad_norm": 0.5469244122505188, + "learning_rate": 2.881672566055276e-05, + "loss": 0.04657519817352295, + "step": 23700 + }, + { + "epoch": 0.8509421144838929, + "grad_norm": 0.9338961839675903, + "learning_rate": 2.8727340984661593e-05, + "loss": 0.04994749069213867, + "step": 23800 + }, + { + "epoch": 0.8545175015195395, + "grad_norm": 0.6873934268951416, + "learning_rate": 2.8637956308770423e-05, + "loss": 0.04766389846801758, + "step": 23900 + }, + { + "epoch": 0.8580928885551861, + "grad_norm": 1.3465129137039185, + "learning_rate": 2.854857163287926e-05, + "loss": 0.04612489223480225, + "step": 24000 + }, + { + "epoch": 0.8616682755908327, + "grad_norm": 0.3835633397102356, + "learning_rate": 2.8459186956988092e-05, + "loss": 0.048950729370117185, + "step": 24100 + }, + { + "epoch": 0.8652436626264793, + "grad_norm": 0.7884401082992554, + "learning_rate": 2.836980228109693e-05, + "loss": 0.046166911125183105, + "step": 24200 + }, + { + "epoch": 0.868819049662126, + "grad_norm": 0.49389323592185974, + "learning_rate": 2.8280417605205767e-05, + "loss": 0.046818752288818356, + "step": 24300 + }, + { + "epoch": 0.8723944366977725, + "grad_norm": 0.6339199542999268, + "learning_rate": 2.8191032929314598e-05, + "loss": 0.04933880805969238, + "step": 24400 + }, + { + "epoch": 0.8759698237334191, + "grad_norm": 0.5761122703552246, + "learning_rate": 2.8101648253423435e-05, + "loss": 0.044534187316894534, + "step": 24500 + }, + { + "epoch": 0.8795452107690658, + "grad_norm": 0.45685720443725586, + "learning_rate": 2.8012263577532273e-05, + "loss": 
0.051560683250427244, + "step": 24600 + }, + { + "epoch": 0.8831205978047124, + "grad_norm": 0.4419282078742981, + "learning_rate": 2.7922878901641104e-05, + "loss": 0.043671913146972656, + "step": 24700 + }, + { + "epoch": 0.8866959848403589, + "grad_norm": 0.734449028968811, + "learning_rate": 2.783349422574994e-05, + "loss": 0.05153060913085938, + "step": 24800 + }, + { + "epoch": 0.8902713718760056, + "grad_norm": 1.0401020050048828, + "learning_rate": 2.7744109549858772e-05, + "loss": 0.04694102287292481, + "step": 24900 + }, + { + "epoch": 0.8938467589116522, + "grad_norm": 0.646715521812439, + "learning_rate": 2.765472487396761e-05, + "loss": 0.054542098045349124, + "step": 25000 + }, + { + "epoch": 0.8938467589116522, + "eval_accuracy": 0.9856964248425865, + "eval_f1": 0.8835749303424683, + "eval_loss": 0.05743265897035599, + "eval_precision": 0.8674635382761534, + "eval_recall": 0.9002961216686313, + "eval_runtime": 27.4328, + "eval_samples_per_second": 820.186, + "eval_steps_per_second": 22.783, + "step": 25000 + }, + { + "epoch": 0.8974221459472987, + "grad_norm": 0.3341001570224762, + "learning_rate": 2.7565340198076444e-05, + "loss": 0.04484391689300537, + "step": 25100 + }, + { + "epoch": 0.9009975329829454, + "grad_norm": 0.700167715549469, + "learning_rate": 2.7475955522185278e-05, + "loss": 0.04423677921295166, + "step": 25200 + }, + { + "epoch": 0.904572920018592, + "grad_norm": 1.2379734516143799, + "learning_rate": 2.7386570846294112e-05, + "loss": 0.04488907337188721, + "step": 25300 + }, + { + "epoch": 0.9081483070542387, + "grad_norm": 0.4145027697086334, + "learning_rate": 2.729718617040295e-05, + "loss": 0.04520434856414795, + "step": 25400 + }, + { + "epoch": 0.9117236940898852, + "grad_norm": 0.3579607605934143, + "learning_rate": 2.720780149451178e-05, + "loss": 0.04551751613616943, + "step": 25500 + }, + { + "epoch": 0.9152990811255318, + "grad_norm": 0.5503469705581665, + "learning_rate": 2.7118416818620618e-05, + "loss": 
0.04752420425415039, + "step": 25600 + }, + { + "epoch": 0.9188744681611785, + "grad_norm": 0.41558948159217834, + "learning_rate": 2.702903214272945e-05, + "loss": 0.05269415855407715, + "step": 25700 + }, + { + "epoch": 0.9224498551968251, + "grad_norm": 1.5605533123016357, + "learning_rate": 2.6939647466838286e-05, + "loss": 0.0499528169631958, + "step": 25800 + }, + { + "epoch": 0.9260252422324716, + "grad_norm": 0.6252946853637695, + "learning_rate": 2.6850262790947124e-05, + "loss": 0.04681193351745606, + "step": 25900 + }, + { + "epoch": 0.9296006292681183, + "grad_norm": 0.4643714427947998, + "learning_rate": 2.6760878115055954e-05, + "loss": 0.04491585254669189, + "step": 26000 + }, + { + "epoch": 0.9331760163037649, + "grad_norm": 1.0552211999893188, + "learning_rate": 2.6671493439164792e-05, + "loss": 0.050134167671203614, + "step": 26100 + }, + { + "epoch": 0.9367514033394115, + "grad_norm": 0.2919712960720062, + "learning_rate": 2.6582108763273626e-05, + "loss": 0.045297045707702634, + "step": 26200 + }, + { + "epoch": 0.9403267903750581, + "grad_norm": 0.5062688589096069, + "learning_rate": 2.649272408738246e-05, + "loss": 0.04247344017028808, + "step": 26300 + }, + { + "epoch": 0.9439021774107047, + "grad_norm": 0.4406910538673401, + "learning_rate": 2.6403339411491294e-05, + "loss": 0.0437799072265625, + "step": 26400 + }, + { + "epoch": 0.9474775644463513, + "grad_norm": 0.41486886143684387, + "learning_rate": 2.6313954735600132e-05, + "loss": 0.04669870376586914, + "step": 26500 + }, + { + "epoch": 0.951052951481998, + "grad_norm": 0.6877465844154358, + "learning_rate": 2.6224570059708963e-05, + "loss": 0.04583415985107422, + "step": 26600 + }, + { + "epoch": 0.9546283385176445, + "grad_norm": 0.6501809358596802, + "learning_rate": 2.61351853838178e-05, + "loss": 0.04593777179718018, + "step": 26700 + }, + { + "epoch": 0.9582037255532911, + "grad_norm": 0.7312682271003723, + "learning_rate": 2.604580070792663e-05, + "loss": 0.050377216339111325, + 
"step": 26800 + }, + { + "epoch": 0.9617791125889378, + "grad_norm": 0.8844775557518005, + "learning_rate": 2.595641603203547e-05, + "loss": 0.04860093593597412, + "step": 26900 + }, + { + "epoch": 0.9653544996245844, + "grad_norm": 0.4647756814956665, + "learning_rate": 2.5867031356144306e-05, + "loss": 0.0445063066482544, + "step": 27000 + }, + { + "epoch": 0.9689298866602309, + "grad_norm": 0.20223687589168549, + "learning_rate": 2.5777646680253137e-05, + "loss": 0.04691956520080567, + "step": 27100 + }, + { + "epoch": 0.9725052736958776, + "grad_norm": 0.9210941195487976, + "learning_rate": 2.5688262004361974e-05, + "loss": 0.049297604560852054, + "step": 27200 + }, + { + "epoch": 0.9760806607315242, + "grad_norm": 0.35992079973220825, + "learning_rate": 2.5598877328470812e-05, + "loss": 0.04701284408569336, + "step": 27300 + }, + { + "epoch": 0.9796560477671707, + "grad_norm": 0.6507813334465027, + "learning_rate": 2.5509492652579643e-05, + "loss": 0.04716668605804444, + "step": 27400 + }, + { + "epoch": 0.9832314348028174, + "grad_norm": 0.5909741520881653, + "learning_rate": 2.542010797668848e-05, + "loss": 0.048493666648864744, + "step": 27500 + }, + { + "epoch": 0.9832314348028174, + "eval_accuracy": 0.9858305504462175, + "eval_f1": 0.8868203247033212, + "eval_loss": 0.05660928413271904, + "eval_precision": 0.8723250413671315, + "eval_recall": 0.9018054796883535, + "eval_runtime": 27.7774, + "eval_samples_per_second": 810.012, + "eval_steps_per_second": 22.5, + "step": 27500 + }, + { + "epoch": 0.986806821838464, + "grad_norm": 0.47291576862335205, + "learning_rate": 2.533072330079731e-05, + "loss": 0.04355491161346436, + "step": 27600 + }, + { + "epoch": 0.9903822088741107, + "grad_norm": 0.4872467815876007, + "learning_rate": 2.5241338624906145e-05, + "loss": 0.0435347318649292, + "step": 27700 + }, + { + "epoch": 0.9939575959097572, + "grad_norm": 1.711300015449524, + "learning_rate": 2.5151953949014983e-05, + "loss": 0.04561484336853027, + "step": 
27800 + }, + { + "epoch": 0.9975329829454038, + "grad_norm": 0.2917760908603668, + "learning_rate": 2.5062569273123814e-05, + "loss": 0.047463297843933105, + "step": 27900 + }, + { + "epoch": 1.0011083699810504, + "grad_norm": 0.2678261697292328, + "learning_rate": 2.497318459723265e-05, + "loss": 0.04366901874542237, + "step": 28000 + }, + { + "epoch": 1.004683757016697, + "grad_norm": 0.3751468062400818, + "learning_rate": 2.4883799921341485e-05, + "loss": 0.03846597194671631, + "step": 28100 + }, + { + "epoch": 1.0082591440523436, + "grad_norm": 0.41662493348121643, + "learning_rate": 2.4794415245450323e-05, + "loss": 0.03653419733047485, + "step": 28200 + }, + { + "epoch": 1.0118345310879904, + "grad_norm": 0.6062248945236206, + "learning_rate": 2.4705030569559157e-05, + "loss": 0.037252871990203856, + "step": 28300 + }, + { + "epoch": 1.015409918123637, + "grad_norm": 0.7458221316337585, + "learning_rate": 2.461564589366799e-05, + "loss": 0.03445641756057739, + "step": 28400 + }, + { + "epoch": 1.0189853051592834, + "grad_norm": 0.13679973781108856, + "learning_rate": 2.4526261217776825e-05, + "loss": 0.03599729061126709, + "step": 28500 + }, + { + "epoch": 1.0225606921949302, + "grad_norm": 1.258949637413025, + "learning_rate": 2.4436876541885663e-05, + "loss": 0.037976634502410886, + "step": 28600 + }, + { + "epoch": 1.0261360792305767, + "grad_norm": 0.27776288986206055, + "learning_rate": 2.4347491865994497e-05, + "loss": 0.03968371391296387, + "step": 28700 + }, + { + "epoch": 1.0297114662662232, + "grad_norm": 0.34287697076797485, + "learning_rate": 2.425810719010333e-05, + "loss": 0.03572561502456665, + "step": 28800 + }, + { + "epoch": 1.03328685330187, + "grad_norm": 0.5158637166023254, + "learning_rate": 2.4168722514212165e-05, + "loss": 0.036703295707702636, + "step": 28900 + }, + { + "epoch": 1.0368622403375165, + "grad_norm": 0.8635151982307434, + "learning_rate": 2.4079337838321e-05, + "loss": 0.035954997539520264, + "step": 29000 + }, + { + 
"epoch": 1.040437627373163, + "grad_norm": 0.6386840343475342, + "learning_rate": 2.3989953162429834e-05, + "loss": 0.039990205764770505, + "step": 29100 + }, + { + "epoch": 1.0440130144088098, + "grad_norm": 0.2795710861682892, + "learning_rate": 2.3900568486538668e-05, + "loss": 0.03744415760040283, + "step": 29200 + }, + { + "epoch": 1.0475884014444563, + "grad_norm": 0.674773097038269, + "learning_rate": 2.3811183810647502e-05, + "loss": 0.038765432834625246, + "step": 29300 + }, + { + "epoch": 1.051163788480103, + "grad_norm": 0.5345519185066223, + "learning_rate": 2.372179913475634e-05, + "loss": 0.03793670177459717, + "step": 29400 + }, + { + "epoch": 1.0547391755157496, + "grad_norm": 0.19475312530994415, + "learning_rate": 2.3632414458865174e-05, + "loss": 0.03510812759399414, + "step": 29500 + }, + { + "epoch": 1.058314562551396, + "grad_norm": 0.6469267010688782, + "learning_rate": 2.3543029782974008e-05, + "loss": 0.03977480411529541, + "step": 29600 + }, + { + "epoch": 1.0618899495870429, + "grad_norm": 0.3818305432796478, + "learning_rate": 2.3453645107082842e-05, + "loss": 0.03915615558624268, + "step": 29700 + }, + { + "epoch": 1.0654653366226894, + "grad_norm": 0.7031393051147461, + "learning_rate": 2.336426043119168e-05, + "loss": 0.03701666355133057, + "step": 29800 + }, + { + "epoch": 1.069040723658336, + "grad_norm": 0.34952452778816223, + "learning_rate": 2.3274875755300514e-05, + "loss": 0.03564514398574829, + "step": 29900 + }, + { + "epoch": 1.0726161106939827, + "grad_norm": 0.5351042747497559, + "learning_rate": 2.3185491079409348e-05, + "loss": 0.04400619983673096, + "step": 30000 + }, + { + "epoch": 1.0726161106939827, + "eval_accuracy": 0.9867462864302234, + "eval_f1": 0.8903530810550676, + "eval_loss": 0.05216454714536667, + "eval_precision": 0.8769046324564705, + "eval_recall": 0.9042204525199091, + "eval_runtime": 27.3869, + "eval_samples_per_second": 821.559, + "eval_steps_per_second": 22.821, + "step": 30000 + }, + { + "epoch": 
1.0761914977296292, + "grad_norm": 0.6395847201347351, + "learning_rate": 2.3096106403518182e-05, + "loss": 0.03795994281768799, + "step": 30100 + }, + { + "epoch": 1.079766884765276, + "grad_norm": 0.2738804221153259, + "learning_rate": 2.3006721727627016e-05, + "loss": 0.034112286567687986, + "step": 30200 + }, + { + "epoch": 1.0833422718009225, + "grad_norm": 0.36416754126548767, + "learning_rate": 2.291733705173585e-05, + "loss": 0.03839835166931152, + "step": 30300 + }, + { + "epoch": 1.086917658836569, + "grad_norm": 0.8902291059494019, + "learning_rate": 2.2827952375844684e-05, + "loss": 0.04109617233276367, + "step": 30400 + }, + { + "epoch": 1.0904930458722157, + "grad_norm": 0.47186803817749023, + "learning_rate": 2.2738567699953522e-05, + "loss": 0.03920984029769897, + "step": 30500 + }, + { + "epoch": 1.0940684329078623, + "grad_norm": 3.810819625854492, + "learning_rate": 2.2649183024062356e-05, + "loss": 0.0391163420677185, + "step": 30600 + }, + { + "epoch": 1.0976438199435088, + "grad_norm": 0.8752216696739197, + "learning_rate": 2.255979834817119e-05, + "loss": 0.038404548168182374, + "step": 30700 + }, + { + "epoch": 1.1012192069791555, + "grad_norm": 0.2776939570903778, + "learning_rate": 2.2470413672280025e-05, + "loss": 0.037470765113830566, + "step": 30800 + }, + { + "epoch": 1.104794594014802, + "grad_norm": 0.549679160118103, + "learning_rate": 2.2381028996388862e-05, + "loss": 0.03804266691207886, + "step": 30900 + }, + { + "epoch": 1.1083699810504486, + "grad_norm": 0.7605739235877991, + "learning_rate": 2.2291644320497696e-05, + "loss": 0.03416654348373413, + "step": 31000 + }, + { + "epoch": 1.1119453680860953, + "grad_norm": 0.16704197227954865, + "learning_rate": 2.220225964460653e-05, + "loss": 0.034537038803100585, + "step": 31100 + }, + { + "epoch": 1.1155207551217419, + "grad_norm": 0.5772648453712463, + "learning_rate": 2.2112874968715365e-05, + "loss": 0.03786729097366333, + "step": 31200 + }, + { + "epoch": 1.1190961421573886, + 
"grad_norm": 0.3576936423778534, + "learning_rate": 2.2023490292824202e-05, + "loss": 0.04146803379058838, + "step": 31300 + }, + { + "epoch": 1.1226715291930351, + "grad_norm": 0.24434928596019745, + "learning_rate": 2.1934105616933033e-05, + "loss": 0.03837924718856812, + "step": 31400 + }, + { + "epoch": 1.1262469162286817, + "grad_norm": 0.8151653409004211, + "learning_rate": 2.1844720941041867e-05, + "loss": 0.03402991771697998, + "step": 31500 + }, + { + "epoch": 1.1298223032643284, + "grad_norm": 0.803303062915802, + "learning_rate": 2.17553362651507e-05, + "loss": 0.03701550483703613, + "step": 31600 + }, + { + "epoch": 1.133397690299975, + "grad_norm": 0.5276838541030884, + "learning_rate": 2.166595158925954e-05, + "loss": 0.037687735557556154, + "step": 31700 + }, + { + "epoch": 1.1369730773356217, + "grad_norm": 1.2563331127166748, + "learning_rate": 2.1576566913368373e-05, + "loss": 0.04105483055114746, + "step": 31800 + }, + { + "epoch": 1.1405484643712682, + "grad_norm": 2.2794508934020996, + "learning_rate": 2.1487182237477207e-05, + "loss": 0.03871995687484741, + "step": 31900 + }, + { + "epoch": 1.1441238514069147, + "grad_norm": 0.5270197987556458, + "learning_rate": 2.139779756158604e-05, + "loss": 0.03748847007751465, + "step": 32000 + }, + { + "epoch": 1.1476992384425615, + "grad_norm": 0.4776967763900757, + "learning_rate": 2.130841288569488e-05, + "loss": 0.04054388523101807, + "step": 32100 + }, + { + "epoch": 1.151274625478208, + "grad_norm": 0.281562864780426, + "learning_rate": 2.1219028209803713e-05, + "loss": 0.03565767288208008, + "step": 32200 + }, + { + "epoch": 1.1548500125138546, + "grad_norm": 0.986331582069397, + "learning_rate": 2.1129643533912547e-05, + "loss": 0.03515695333480835, + "step": 32300 + }, + { + "epoch": 1.1584253995495013, + "grad_norm": 1.0339690446853638, + "learning_rate": 2.104025885802138e-05, + "loss": 0.03650200843811035, + "step": 32400 + }, + { + "epoch": 1.1620007865851478, + "grad_norm": 
0.6622812747955322, + "learning_rate": 2.095087418213022e-05, + "loss": 0.03963910102844238, + "step": 32500 + }, + { + "epoch": 1.1620007865851478, + "eval_accuracy": 0.9872691132930045, + "eval_f1": 0.8901178950048444, + "eval_loss": 0.050942763686180115, + "eval_precision": 0.8760841419442859, + "eval_recall": 0.904608573153552, + "eval_runtime": 27.7471, + "eval_samples_per_second": 810.896, + "eval_steps_per_second": 22.525, + "step": 32500 + }, + { + "epoch": 1.1655761736207944, + "grad_norm": 0.4157122075557709, + "learning_rate": 2.0861489506239053e-05, + "loss": 0.03528056383132935, + "step": 32600 + }, + { + "epoch": 1.169151560656441, + "grad_norm": 1.0833650827407837, + "learning_rate": 2.0772104830347887e-05, + "loss": 0.0338432765007019, + "step": 32700 + }, + { + "epoch": 1.1727269476920876, + "grad_norm": 0.6234818696975708, + "learning_rate": 2.068272015445672e-05, + "loss": 0.03545186996459961, + "step": 32800 + }, + { + "epoch": 1.1763023347277342, + "grad_norm": 0.46430152654647827, + "learning_rate": 2.0593335478565555e-05, + "loss": 0.03938552379608154, + "step": 32900 + }, + { + "epoch": 1.179877721763381, + "grad_norm": 0.32441213726997375, + "learning_rate": 2.050395080267439e-05, + "loss": 0.03765884399414063, + "step": 33000 + }, + { + "epoch": 1.1834531087990274, + "grad_norm": 0.5149340033531189, + "learning_rate": 2.0414566126783224e-05, + "loss": 0.04111374378204346, + "step": 33100 + }, + { + "epoch": 1.1870284958346742, + "grad_norm": 0.6311440467834473, + "learning_rate": 2.032518145089206e-05, + "loss": 0.03235443115234375, + "step": 33200 + }, + { + "epoch": 1.1906038828703207, + "grad_norm": 0.41769224405288696, + "learning_rate": 2.0235796775000895e-05, + "loss": 0.03542477607727051, + "step": 33300 + }, + { + "epoch": 1.1941792699059672, + "grad_norm": 1.399487853050232, + "learning_rate": 2.014641209910973e-05, + "loss": 0.03973909854888916, + "step": 33400 + }, + { + "epoch": 1.197754656941614, + "grad_norm": 
0.44740626215934753, + "learning_rate": 2.0057027423218564e-05, + "loss": 0.03419320821762085, + "step": 33500 + }, + { + "epoch": 1.2013300439772605, + "grad_norm": 0.7771443128585815, + "learning_rate": 1.99676427473274e-05, + "loss": 0.03688163042068481, + "step": 33600 + }, + { + "epoch": 1.2049054310129073, + "grad_norm": 0.33263227343559265, + "learning_rate": 1.9878258071436235e-05, + "loss": 0.0361082911491394, + "step": 33700 + }, + { + "epoch": 1.2084808180485538, + "grad_norm": 0.586033821105957, + "learning_rate": 1.978887339554507e-05, + "loss": 0.037032432556152343, + "step": 33800 + }, + { + "epoch": 1.2120562050842003, + "grad_norm": 0.17661893367767334, + "learning_rate": 1.9699488719653904e-05, + "loss": 0.03797416687011719, + "step": 33900 + }, + { + "epoch": 1.215631592119847, + "grad_norm": 0.6682581305503845, + "learning_rate": 1.9610104043762738e-05, + "loss": 0.03688710927963257, + "step": 34000 + }, + { + "epoch": 1.2192069791554936, + "grad_norm": 0.33618828654289246, + "learning_rate": 1.9520719367871572e-05, + "loss": 0.03531041145324707, + "step": 34100 + }, + { + "epoch": 1.2227823661911401, + "grad_norm": 0.2299039363861084, + "learning_rate": 1.9431334691980406e-05, + "loss": 0.037303669452667235, + "step": 34200 + }, + { + "epoch": 1.2263577532267869, + "grad_norm": 0.38670745491981506, + "learning_rate": 1.934195001608924e-05, + "loss": 0.03624207735061646, + "step": 34300 + }, + { + "epoch": 1.2299331402624334, + "grad_norm": 0.28273847699165344, + "learning_rate": 1.9252565340198078e-05, + "loss": 0.03737942218780518, + "step": 34400 + }, + { + "epoch": 1.23350852729808, + "grad_norm": 0.1840369552373886, + "learning_rate": 1.9163180664306912e-05, + "loss": 0.04193697929382324, + "step": 34500 + }, + { + "epoch": 1.2370839143337267, + "grad_norm": 0.3581949770450592, + "learning_rate": 1.9073795988415746e-05, + "loss": 0.03538564205169678, + "step": 34600 + }, + { + "epoch": 1.2406593013693732, + "grad_norm": 0.47306036949157715, 
+ "learning_rate": 1.898441131252458e-05, + "loss": 0.03894999265670776, + "step": 34700 + }, + { + "epoch": 1.24423468840502, + "grad_norm": 0.961359977722168, + "learning_rate": 1.8895026636633418e-05, + "loss": 0.03768787622451782, + "step": 34800 + }, + { + "epoch": 1.2478100754406665, + "grad_norm": 0.873396098613739, + "learning_rate": 1.8805641960742252e-05, + "loss": 0.03798648834228516, + "step": 34900 + }, + { + "epoch": 1.251385462476313, + "grad_norm": 0.27755600214004517, + "learning_rate": 1.8716257284851086e-05, + "loss": 0.03826235771179199, + "step": 35000 + }, + { + "epoch": 1.251385462476313, + "eval_accuracy": 0.9878845131214289, + "eval_f1": 0.8920698296733638, + "eval_loss": 0.04892827197909355, + "eval_precision": 0.8788428276516208, + "eval_recall": 0.9057010608630653, + "eval_runtime": 27.2527, + "eval_samples_per_second": 825.607, + "eval_steps_per_second": 22.934, + "step": 35000 + }, + { + "epoch": 1.2549608495119597, + "grad_norm": 0.19469444453716278, + "learning_rate": 1.862687260895992e-05, + "loss": 0.037444868087768556, + "step": 35100 + }, + { + "epoch": 1.2585362365476063, + "grad_norm": 0.7563005685806274, + "learning_rate": 1.8537487933068755e-05, + "loss": 0.03585953950881958, + "step": 35200 + }, + { + "epoch": 1.262111623583253, + "grad_norm": 0.748693585395813, + "learning_rate": 1.844810325717759e-05, + "loss": 0.036836111545562746, + "step": 35300 + }, + { + "epoch": 1.2656870106188995, + "grad_norm": 0.2749057114124298, + "learning_rate": 1.8358718581286423e-05, + "loss": 0.035653345584869385, + "step": 35400 + }, + { + "epoch": 1.269262397654546, + "grad_norm": 0.46990424394607544, + "learning_rate": 1.826933390539526e-05, + "loss": 0.038926541805267334, + "step": 35500 + }, + { + "epoch": 1.2728377846901928, + "grad_norm": 0.5694590210914612, + "learning_rate": 1.8179949229504095e-05, + "loss": 0.041200418472290036, + "step": 35600 + }, + { + "epoch": 1.2764131717258393, + "grad_norm": 0.44198593497276306, + 
"learning_rate": 1.809056455361293e-05, + "loss": 0.03306173086166382, + "step": 35700 + }, + { + "epoch": 1.2799885587614859, + "grad_norm": 1.5265918970108032, + "learning_rate": 1.8001179877721763e-05, + "loss": 0.03929618358612061, + "step": 35800 + }, + { + "epoch": 1.2835639457971326, + "grad_norm": 0.568000078201294, + "learning_rate": 1.79117952018306e-05, + "loss": 0.035215189456939695, + "step": 35900 + }, + { + "epoch": 1.2871393328327791, + "grad_norm": 0.3256838619709015, + "learning_rate": 1.7822410525939435e-05, + "loss": 0.03693248510360718, + "step": 36000 + }, + { + "epoch": 1.2907147198684257, + "grad_norm": 0.37276744842529297, + "learning_rate": 1.773302585004827e-05, + "loss": 0.038254330158233645, + "step": 36100 + }, + { + "epoch": 1.2942901069040724, + "grad_norm": 0.9104180335998535, + "learning_rate": 1.7643641174157103e-05, + "loss": 0.03706887722015381, + "step": 36200 + }, + { + "epoch": 1.297865493939719, + "grad_norm": 0.855074942111969, + "learning_rate": 1.755425649826594e-05, + "loss": 0.039341244697570804, + "step": 36300 + }, + { + "epoch": 1.3014408809753655, + "grad_norm": 1.0919744968414307, + "learning_rate": 1.7464871822374775e-05, + "loss": 0.03796007394790649, + "step": 36400 + }, + { + "epoch": 1.3050162680110122, + "grad_norm": 0.4765317142009735, + "learning_rate": 1.737548714648361e-05, + "loss": 0.03425301790237427, + "step": 36500 + }, + { + "epoch": 1.3085916550466588, + "grad_norm": 0.28184378147125244, + "learning_rate": 1.728610247059244e-05, + "loss": 0.03511073589324951, + "step": 36600 + }, + { + "epoch": 1.3121670420823053, + "grad_norm": 0.26926326751708984, + "learning_rate": 1.7196717794701277e-05, + "loss": 0.03917940616607666, + "step": 36700 + }, + { + "epoch": 1.315742429117952, + "grad_norm": 2.2863128185272217, + "learning_rate": 1.710733311881011e-05, + "loss": 0.03748450517654419, + "step": 36800 + }, + { + "epoch": 1.3193178161535986, + "grad_norm": 0.47158753871917725, + "learning_rate": 
1.7017948442918946e-05, + "loss": 0.034841620922088624, + "step": 36900 + }, + { + "epoch": 1.3228932031892453, + "grad_norm": 0.3611966371536255, + "learning_rate": 1.692856376702778e-05, + "loss": 0.03597846508026123, + "step": 37000 + }, + { + "epoch": 1.3264685902248918, + "grad_norm": 0.19897930324077606, + "learning_rate": 1.6839179091136617e-05, + "loss": 0.0384373140335083, + "step": 37100 + }, + { + "epoch": 1.3300439772605386, + "grad_norm": 0.4929654002189636, + "learning_rate": 1.674979441524545e-05, + "loss": 0.03474846363067627, + "step": 37200 + }, + { + "epoch": 1.333619364296185, + "grad_norm": 1.4330233335494995, + "learning_rate": 1.6660409739354286e-05, + "loss": 0.03804588317871094, + "step": 37300 + }, + { + "epoch": 1.3371947513318316, + "grad_norm": 0.7935028076171875, + "learning_rate": 1.6571025063463123e-05, + "loss": 0.036091580390930175, + "step": 37400 + }, + { + "epoch": 1.3407701383674784, + "grad_norm": 0.6093057990074158, + "learning_rate": 1.6481640387571957e-05, + "loss": 0.036958491802215575, + "step": 37500 + }, + { + "epoch": 1.3407701383674784, + "eval_accuracy": 0.9876767499314908, + "eval_f1": 0.8963199795830114, + "eval_loss": 0.048646602779626846, + "eval_precision": 0.8842404151455387, + "eval_recall": 0.9087341517407929, + "eval_runtime": 27.7954, + "eval_samples_per_second": 809.486, + "eval_steps_per_second": 22.486, + "step": 37500 + }, + { + "epoch": 1.344345525403125, + "grad_norm": 0.530693531036377, + "learning_rate": 1.639225571168079e-05, + "loss": 0.040179696083068844, + "step": 37600 + }, + { + "epoch": 1.3479209124387714, + "grad_norm": 0.70650714635849, + "learning_rate": 1.6302871035789626e-05, + "loss": 0.03599003076553345, + "step": 37700 + }, + { + "epoch": 1.3514962994744182, + "grad_norm": 0.673740029335022, + "learning_rate": 1.621348635989846e-05, + "loss": 0.03707956552505493, + "step": 37800 + }, + { + "epoch": 1.3550716865100647, + "grad_norm": 0.28047823905944824, + "learning_rate": 
1.6124101684007294e-05, + "loss": 0.034383256435394284, + "step": 37900 + }, + { + "epoch": 1.3586470735457112, + "grad_norm": 0.4644497036933899, + "learning_rate": 1.6034717008116128e-05, + "loss": 0.039096081256866456, + "step": 38000 + }, + { + "epoch": 1.362222460581358, + "grad_norm": 0.2905023992061615, + "learning_rate": 1.5945332332224962e-05, + "loss": 0.031935737133026124, + "step": 38100 + }, + { + "epoch": 1.3657978476170045, + "grad_norm": 0.519289493560791, + "learning_rate": 1.58559476563338e-05, + "loss": 0.03160768747329712, + "step": 38200 + }, + { + "epoch": 1.369373234652651, + "grad_norm": 0.4803026616573334, + "learning_rate": 1.5766562980442634e-05, + "loss": 0.03475278615951538, + "step": 38300 + }, + { + "epoch": 1.3729486216882978, + "grad_norm": 0.2219659686088562, + "learning_rate": 1.5677178304551468e-05, + "loss": 0.03382747411727905, + "step": 38400 + }, + { + "epoch": 1.3765240087239443, + "grad_norm": 0.9020390510559082, + "learning_rate": 1.5587793628660302e-05, + "loss": 0.03778740644454956, + "step": 38500 + }, + { + "epoch": 1.3800993957595908, + "grad_norm": 0.4074041247367859, + "learning_rate": 1.549840895276914e-05, + "loss": 0.03417648077011108, + "step": 38600 + }, + { + "epoch": 1.3836747827952376, + "grad_norm": 0.2950891852378845, + "learning_rate": 1.5409024276877974e-05, + "loss": 0.037335121631622316, + "step": 38700 + }, + { + "epoch": 1.3872501698308841, + "grad_norm": 0.5112789869308472, + "learning_rate": 1.5319639600986808e-05, + "loss": 0.03443581342697143, + "step": 38800 + }, + { + "epoch": 1.3908255568665309, + "grad_norm": 0.6883418560028076, + "learning_rate": 1.523025492509564e-05, + "loss": 0.03647557497024536, + "step": 38900 + }, + { + "epoch": 1.3944009439021774, + "grad_norm": 0.22857694327831268, + "learning_rate": 1.5140870249204478e-05, + "loss": 0.03520648956298828, + "step": 39000 + }, + { + "epoch": 1.3979763309378241, + "grad_norm": 1.4312663078308105, + "learning_rate": 
1.5051485573313312e-05, + "loss": 0.031425106525421145, + "step": 39100 + }, + { + "epoch": 1.4015517179734707, + "grad_norm": 0.7821195125579834, + "learning_rate": 1.4962100897422146e-05, + "loss": 0.03315335750579834, + "step": 39200 + }, + { + "epoch": 1.4051271050091172, + "grad_norm": 0.27848535776138306, + "learning_rate": 1.487271622153098e-05, + "loss": 0.033316426277160645, + "step": 39300 + }, + { + "epoch": 1.408702492044764, + "grad_norm": 0.6713240146636963, + "learning_rate": 1.4783331545639816e-05, + "loss": 0.033266935348510746, + "step": 39400 + }, + { + "epoch": 1.4122778790804105, + "grad_norm": 3.596701145172119, + "learning_rate": 1.469394686974865e-05, + "loss": 0.03419414281845093, + "step": 39500 + }, + { + "epoch": 1.415853266116057, + "grad_norm": 1.069840908050537, + "learning_rate": 1.4604562193857485e-05, + "loss": 0.035397300720214846, + "step": 39600 + }, + { + "epoch": 1.4194286531517037, + "grad_norm": 0.2466162145137787, + "learning_rate": 1.4515177517966322e-05, + "loss": 0.0358107590675354, + "step": 39700 + }, + { + "epoch": 1.4230040401873503, + "grad_norm": 0.5182567834854126, + "learning_rate": 1.4425792842075156e-05, + "loss": 0.03377439260482788, + "step": 39800 + }, + { + "epoch": 1.4265794272229968, + "grad_norm": 0.8782963752746582, + "learning_rate": 1.433640816618399e-05, + "loss": 0.03737137794494629, + "step": 39900 + }, + { + "epoch": 1.4301548142586435, + "grad_norm": 0.2662527561187744, + "learning_rate": 1.4247023490292825e-05, + "loss": 0.035046143531799315, + "step": 40000 + }, + { + "epoch": 1.4301548142586435, + "eval_accuracy": 0.9874205963276935, + "eval_f1": 0.8909038185431681, + "eval_loss": 0.048918217420578, + "eval_precision": 0.8769057265778372, + "eval_recall": 0.9053560647442717, + "eval_runtime": 27.308, + "eval_samples_per_second": 823.934, + "eval_steps_per_second": 22.887, + "step": 40000 + }, + { + "epoch": 1.43373020129429, + "grad_norm": 4.632917404174805, + "learning_rate": 
1.415763881440166e-05, + "loss": 0.03318638563156128, + "step": 40100 + }, + { + "epoch": 1.4373055883299366, + "grad_norm": 0.34400591254234314, + "learning_rate": 1.4068254138510495e-05, + "loss": 0.03380630970001221, + "step": 40200 + }, + { + "epoch": 1.4408809753655833, + "grad_norm": 0.3949352204799652, + "learning_rate": 1.3978869462619329e-05, + "loss": 0.035887646675109866, + "step": 40300 + }, + { + "epoch": 1.4444563624012299, + "grad_norm": 0.21083228290081024, + "learning_rate": 1.3889484786728163e-05, + "loss": 0.02981067180633545, + "step": 40400 + }, + { + "epoch": 1.4480317494368766, + "grad_norm": 0.5403398871421814, + "learning_rate": 1.3800100110836999e-05, + "loss": 0.03951683759689331, + "step": 40500 + }, + { + "epoch": 1.4516071364725232, + "grad_norm": 0.37334415316581726, + "learning_rate": 1.3710715434945833e-05, + "loss": 0.03376241683959961, + "step": 40600 + }, + { + "epoch": 1.4551825235081697, + "grad_norm": 0.6374111771583557, + "learning_rate": 1.3621330759054667e-05, + "loss": 0.035758087635040285, + "step": 40700 + }, + { + "epoch": 1.4587579105438164, + "grad_norm": 0.4704621434211731, + "learning_rate": 1.3531946083163501e-05, + "loss": 0.03579946041107178, + "step": 40800 + }, + { + "epoch": 1.462333297579463, + "grad_norm": 0.31890979409217834, + "learning_rate": 1.3442561407272339e-05, + "loss": 0.036891818046569824, + "step": 40900 + }, + { + "epoch": 1.4659086846151097, + "grad_norm": 0.36003023386001587, + "learning_rate": 1.3353176731381173e-05, + "loss": 0.03722346544265747, + "step": 41000 + }, + { + "epoch": 1.4694840716507562, + "grad_norm": 0.3868881165981293, + "learning_rate": 1.3263792055490007e-05, + "loss": 0.03188649654388428, + "step": 41100 + }, + { + "epoch": 1.4730594586864028, + "grad_norm": 0.1989583820104599, + "learning_rate": 1.3174407379598841e-05, + "loss": 0.03385810136795044, + "step": 41200 + }, + { + "epoch": 1.4766348457220495, + "grad_norm": 1.653865933418274, + "learning_rate": 
1.3085022703707677e-05, + "loss": 0.033811585903167726, + "step": 41300 + }, + { + "epoch": 1.480210232757696, + "grad_norm": 0.4005359709262848, + "learning_rate": 1.2995638027816512e-05, + "loss": 0.034179413318634035, + "step": 41400 + }, + { + "epoch": 1.4837856197933426, + "grad_norm": 0.40698060393333435, + "learning_rate": 1.2906253351925346e-05, + "loss": 0.0344992733001709, + "step": 41500 + }, + { + "epoch": 1.4873610068289893, + "grad_norm": 0.23063120245933533, + "learning_rate": 1.281686867603418e-05, + "loss": 0.036886801719665525, + "step": 41600 + }, + { + "epoch": 1.4909363938646358, + "grad_norm": 0.36372461915016174, + "learning_rate": 1.2727484000143017e-05, + "loss": 0.03418808460235596, + "step": 41700 + }, + { + "epoch": 1.4945117809002824, + "grad_norm": 3.4656498432159424, + "learning_rate": 1.2638099324251852e-05, + "loss": 0.035131211280822756, + "step": 41800 + }, + { + "epoch": 1.498087167935929, + "grad_norm": 0.5397525429725647, + "learning_rate": 1.2548714648360686e-05, + "loss": 0.032310936450958255, + "step": 41900 + }, + { + "epoch": 1.5016625549715756, + "grad_norm": 0.803663969039917, + "learning_rate": 1.245932997246952e-05, + "loss": 0.034068484306335446, + "step": 42000 + }, + { + "epoch": 1.5052379420072222, + "grad_norm": 0.44578149914741516, + "learning_rate": 1.2369945296578356e-05, + "loss": 0.033568575382232665, + "step": 42100 + }, + { + "epoch": 1.508813329042869, + "grad_norm": 0.3740385174751282, + "learning_rate": 1.228056062068719e-05, + "loss": 0.0316014552116394, + "step": 42200 + }, + { + "epoch": 1.5123887160785157, + "grad_norm": 0.7885581254959106, + "learning_rate": 1.2191175944796026e-05, + "loss": 0.036092112064361574, + "step": 42300 + }, + { + "epoch": 1.515964103114162, + "grad_norm": 0.2616823613643646, + "learning_rate": 1.210179126890486e-05, + "loss": 0.0364843225479126, + "step": 42400 + }, + { + "epoch": 1.5195394901498087, + "grad_norm": 1.1933097839355469, + "learning_rate": 
1.2012406593013694e-05, + "loss": 0.032956657409667967, + "step": 42500 + }, + { + "epoch": 1.5195394901498087, + "eval_accuracy": 0.9878976626904123, + "eval_f1": 0.8964901338171921, + "eval_loss": 0.04782980680465698, + "eval_precision": 0.8842314252957132, + "eval_recall": 0.9090935226978697, + "eval_runtime": 27.8912, + "eval_samples_per_second": 806.706, + "eval_steps_per_second": 22.409, + "step": 42500 + }, + { + "epoch": 1.5231148771854555, + "grad_norm": 1.002236247062683, + "learning_rate": 1.1923021917122528e-05, + "loss": 0.03267708301544189, + "step": 42600 + }, + { + "epoch": 1.526690264221102, + "grad_norm": 0.2965432405471802, + "learning_rate": 1.1833637241231364e-05, + "loss": 0.03969228982925415, + "step": 42700 + }, + { + "epoch": 1.5302656512567485, + "grad_norm": 0.35980096459388733, + "learning_rate": 1.1744252565340198e-05, + "loss": 0.033807692527770994, + "step": 42800 + }, + { + "epoch": 1.5338410382923953, + "grad_norm": 0.4036603271961212, + "learning_rate": 1.1654867889449034e-05, + "loss": 0.036050264835357664, + "step": 42900 + }, + { + "epoch": 1.5374164253280418, + "grad_norm": 0.4341689348220825, + "learning_rate": 1.1565483213557868e-05, + "loss": 0.03344399690628052, + "step": 43000 + }, + { + "epoch": 1.5409918123636883, + "grad_norm": 0.35666847229003906, + "learning_rate": 1.1476098537666702e-05, + "loss": 0.035790588855743405, + "step": 43100 + }, + { + "epoch": 1.544567199399335, + "grad_norm": 2.009552001953125, + "learning_rate": 1.1386713861775537e-05, + "loss": 0.03580213069915771, + "step": 43200 + }, + { + "epoch": 1.5481425864349816, + "grad_norm": 0.9199197888374329, + "learning_rate": 1.1297329185884372e-05, + "loss": 0.035557851791381836, + "step": 43300 + }, + { + "epoch": 1.5517179734706281, + "grad_norm": 0.3379763662815094, + "learning_rate": 1.1207944509993207e-05, + "loss": 0.037502107620239256, + "step": 43400 + }, + { + "epoch": 1.5552933605062749, + "grad_norm": 0.4002296030521393, + "learning_rate": 
1.1118559834102042e-05, + "loss": 0.03514168262481689, + "step": 43500 + }, + { + "epoch": 1.5588687475419214, + "grad_norm": 0.44335803389549255, + "learning_rate": 1.1029175158210877e-05, + "loss": 0.03210949659347534, + "step": 43600 + }, + { + "epoch": 1.562444134577568, + "grad_norm": 0.3367313742637634, + "learning_rate": 1.0939790482319712e-05, + "loss": 0.03381946325302124, + "step": 43700 + }, + { + "epoch": 1.5660195216132147, + "grad_norm": 0.3180839419364929, + "learning_rate": 1.0850405806428547e-05, + "loss": 0.033136572837829587, + "step": 43800 + }, + { + "epoch": 1.5695949086488612, + "grad_norm": 0.49929025769233704, + "learning_rate": 1.076102113053738e-05, + "loss": 0.03284239530563354, + "step": 43900 + }, + { + "epoch": 1.5731702956845077, + "grad_norm": 0.36956411600112915, + "learning_rate": 1.0671636454646217e-05, + "loss": 0.032391068935394285, + "step": 44000 + }, + { + "epoch": 1.5767456827201545, + "grad_norm": 0.3806305527687073, + "learning_rate": 1.058225177875505e-05, + "loss": 0.03159698247909546, + "step": 44100 + }, + { + "epoch": 1.5803210697558012, + "grad_norm": 0.24886535108089447, + "learning_rate": 1.0492867102863887e-05, + "loss": 0.03376968622207641, + "step": 44200 + }, + { + "epoch": 1.5838964567914475, + "grad_norm": 0.8062007427215576, + "learning_rate": 1.040348242697272e-05, + "loss": 0.031125342845916747, + "step": 44300 + }, + { + "epoch": 1.5874718438270943, + "grad_norm": 0.32632651925086975, + "learning_rate": 1.0314097751081555e-05, + "loss": 0.032405462265014645, + "step": 44400 + }, + { + "epoch": 1.591047230862741, + "grad_norm": 0.9697968363761902, + "learning_rate": 1.0224713075190389e-05, + "loss": 0.0316835880279541, + "step": 44500 + }, + { + "epoch": 1.5946226178983876, + "grad_norm": 0.7041149735450745, + "learning_rate": 1.0135328399299225e-05, + "loss": 0.03227449417114258, + "step": 44600 + }, + { + "epoch": 1.598198004934034, + "grad_norm": 1.0169494152069092, + "learning_rate": 
1.0045943723408059e-05, + "loss": 0.03636837244033814, + "step": 44700 + }, + { + "epoch": 1.6017733919696808, + "grad_norm": 1.4278594255447388, + "learning_rate": 9.956559047516895e-06, + "loss": 0.036050994396209714, + "step": 44800 + }, + { + "epoch": 1.6053487790053274, + "grad_norm": 0.21218614280223846, + "learning_rate": 9.867174371625729e-06, + "loss": 0.03155009746551514, + "step": 44900 + }, + { + "epoch": 1.6089241660409739, + "grad_norm": 0.2901414930820465, + "learning_rate": 9.777789695734563e-06, + "loss": 0.030832624435424803, + "step": 45000 + }, + { + "epoch": 1.6089241660409739, + "eval_accuracy": 0.9888076128640655, + "eval_f1": 0.9007828635915198, + "eval_loss": 0.04577971622347832, + "eval_precision": 0.8896631009295218, + "eval_recall": 0.9121841129287296, + "eval_runtime": 27.4639, + "eval_samples_per_second": 819.259, + "eval_steps_per_second": 22.757, + "step": 45000 + }, + { + "epoch": 1.6124995530766206, + "grad_norm": 0.6742628812789917, + "learning_rate": 9.688405019843397e-06, + "loss": 0.03396400213241577, + "step": 45100 + }, + { + "epoch": 1.6160749401122672, + "grad_norm": 0.30497708916664124, + "learning_rate": 9.599020343952233e-06, + "loss": 0.030751326084136964, + "step": 45200 + }, + { + "epoch": 1.6196503271479137, + "grad_norm": 0.33833158016204834, + "learning_rate": 9.509635668061067e-06, + "loss": 0.03044323444366455, + "step": 45300 + }, + { + "epoch": 1.6232257141835604, + "grad_norm": 0.35390418767929077, + "learning_rate": 9.420250992169903e-06, + "loss": 0.03425618410110474, + "step": 45400 + }, + { + "epoch": 1.626801101219207, + "grad_norm": 0.6008805632591248, + "learning_rate": 9.330866316278737e-06, + "loss": 0.03422411203384399, + "step": 45500 + }, + { + "epoch": 1.6303764882548535, + "grad_norm": 0.7057814598083496, + "learning_rate": 9.241481640387573e-06, + "loss": 0.03580734968185425, + "step": 45600 + }, + { + "epoch": 1.6339518752905002, + "grad_norm": 0.6222581267356873, + "learning_rate": 
9.152096964496407e-06, + "loss": 0.03245258092880249, + "step": 45700 + }, + { + "epoch": 1.6375272623261468, + "grad_norm": 0.19113455712795258, + "learning_rate": 9.062712288605242e-06, + "loss": 0.03314180135726929, + "step": 45800 + }, + { + "epoch": 1.6411026493617933, + "grad_norm": 0.35139983892440796, + "learning_rate": 8.973327612714076e-06, + "loss": 0.03314854860305786, + "step": 45900 + }, + { + "epoch": 1.64467803639744, + "grad_norm": 2.3638358116149902, + "learning_rate": 8.883942936822912e-06, + "loss": 0.03374920845031738, + "step": 46000 + }, + { + "epoch": 1.6482534234330868, + "grad_norm": 0.3906150162220001, + "learning_rate": 8.794558260931746e-06, + "loss": 0.030902385711669922, + "step": 46100 + }, + { + "epoch": 1.651828810468733, + "grad_norm": 1.5684771537780762, + "learning_rate": 8.705173585040582e-06, + "loss": 0.03261609077453613, + "step": 46200 + }, + { + "epoch": 1.6554041975043798, + "grad_norm": 0.5489705801010132, + "learning_rate": 8.615788909149416e-06, + "loss": 0.032475869655609134, + "step": 46300 + }, + { + "epoch": 1.6589795845400266, + "grad_norm": 0.4629211127758026, + "learning_rate": 8.52640423325825e-06, + "loss": 0.0343438458442688, + "step": 46400 + }, + { + "epoch": 1.6625549715756731, + "grad_norm": 0.35416728258132935, + "learning_rate": 8.437019557367086e-06, + "loss": 0.030291988849639892, + "step": 46500 + }, + { + "epoch": 1.6661303586113196, + "grad_norm": 0.3730672597885132, + "learning_rate": 8.34763488147592e-06, + "loss": 0.03114586353302002, + "step": 46600 + }, + { + "epoch": 1.6697057456469664, + "grad_norm": 0.8023098111152649, + "learning_rate": 8.258250205584756e-06, + "loss": 0.031157519817352295, + "step": 46700 + }, + { + "epoch": 1.673281132682613, + "grad_norm": 0.3616831600666046, + "learning_rate": 8.16886552969359e-06, + "loss": 0.03633548498153687, + "step": 46800 + }, + { + "epoch": 1.6768565197182594, + "grad_norm": 0.2969978451728821, + "learning_rate": 8.079480853802424e-06, + "loss": 
0.030888726711273195, + "step": 46900 + }, + { + "epoch": 1.6804319067539062, + "grad_norm": 0.5954911708831787, + "learning_rate": 7.990096177911258e-06, + "loss": 0.02800543785095215, + "step": 47000 + }, + { + "epoch": 1.6840072937895527, + "grad_norm": 0.28519004583358765, + "learning_rate": 7.900711502020094e-06, + "loss": 0.0348360013961792, + "step": 47100 + }, + { + "epoch": 1.6875826808251992, + "grad_norm": 3.0812149047851562, + "learning_rate": 7.811326826128928e-06, + "loss": 0.03429551839828491, + "step": 47200 + }, + { + "epoch": 1.691158067860846, + "grad_norm": 0.3664245903491974, + "learning_rate": 7.721942150237764e-06, + "loss": 0.03342988014221191, + "step": 47300 + }, + { + "epoch": 1.6947334548964925, + "grad_norm": 0.4746117889881134, + "learning_rate": 7.632557474346598e-06, + "loss": 0.031046552658081053, + "step": 47400 + }, + { + "epoch": 1.698308841932139, + "grad_norm": 0.26298218965530396, + "learning_rate": 7.543172798455433e-06, + "loss": 0.03168731689453125, + "step": 47500 + }, + { + "epoch": 1.698308841932139, + "eval_accuracy": 0.9886813770018247, + "eval_f1": 0.8991618091307493, + "eval_loss": 0.04538652300834656, + "eval_precision": 0.8872949672507418, + "eval_recall": 0.9113503723083115, + "eval_runtime": 27.7648, + "eval_samples_per_second": 810.377, + "eval_steps_per_second": 22.51, + "step": 47500 + }, + { + "epoch": 1.7018842289677858, + "grad_norm": 1.6149009466171265, + "learning_rate": 7.4537881225642675e-06, + "loss": 0.035295097827911376, + "step": 47600 + }, + { + "epoch": 1.7054596160034323, + "grad_norm": 0.37669169902801514, + "learning_rate": 7.3644034466731025e-06, + "loss": 0.03364665269851685, + "step": 47700 + }, + { + "epoch": 1.7090350030390788, + "grad_norm": 0.5029271841049194, + "learning_rate": 7.275018770781937e-06, + "loss": 0.032778596878051756, + "step": 47800 + }, + { + "epoch": 1.7126103900747256, + "grad_norm": 0.265184611082077, + "learning_rate": 7.1856340948907725e-06, + "loss": 
0.033551807403564456, + "step": 47900 + }, + { + "epoch": 1.7161857771103723, + "grad_norm": 0.5929502248764038, + "learning_rate": 7.096249418999607e-06, + "loss": 0.033371658325195314, + "step": 48000 + }, + { + "epoch": 1.7197611641460187, + "grad_norm": 0.6151393055915833, + "learning_rate": 7.006864743108442e-06, + "loss": 0.034622840881347657, + "step": 48100 + }, + { + "epoch": 1.7233365511816654, + "grad_norm": null, + "learning_rate": 6.917480067217276e-06, + "loss": 0.032550268173217774, + "step": 48200 + }, + { + "epoch": 1.7269119382173121, + "grad_norm": 3.7852137088775635, + "learning_rate": 6.828095391326112e-06, + "loss": 0.03138866424560547, + "step": 48300 + }, + { + "epoch": 1.7304873252529587, + "grad_norm": 0.1753600835800171, + "learning_rate": 6.738710715434946e-06, + "loss": 0.03186697244644165, + "step": 48400 + }, + { + "epoch": 1.7340627122886052, + "grad_norm": 6.609533786773682, + "learning_rate": 6.649326039543781e-06, + "loss": 0.031199581623077392, + "step": 48500 + }, + { + "epoch": 1.737638099324252, + "grad_norm": 1.9689279794692993, + "learning_rate": 6.559941363652617e-06, + "loss": 0.03473323583602905, + "step": 48600 + }, + { + "epoch": 1.7412134863598985, + "grad_norm": 1.0971671342849731, + "learning_rate": 6.47055668776145e-06, + "loss": 0.031001167297363283, + "step": 48700 + }, + { + "epoch": 1.744788873395545, + "grad_norm": 0.5941652655601501, + "learning_rate": 6.381172011870286e-06, + "loss": 0.03148573875427246, + "step": 48800 + }, + { + "epoch": 1.7483642604311918, + "grad_norm": 1.0142033100128174, + "learning_rate": 6.29178733597912e-06, + "loss": 0.03321949720382691, + "step": 48900 + }, + { + "epoch": 1.7519396474668383, + "grad_norm": 1.1377204656600952, + "learning_rate": 6.202402660087954e-06, + "loss": 0.03343360424041748, + "step": 49000 + }, + { + "epoch": 1.7555150345024848, + "grad_norm": 0.5484851002693176, + "learning_rate": 6.113017984196789e-06, + "loss": 0.03009215831756592, + "step": 49100 + 
}, + { + "epoch": 1.7590904215381316, + "grad_norm": 0.4845998287200928, + "learning_rate": 6.023633308305624e-06, + "loss": 0.03416025161743164, + "step": 49200 + }, + { + "epoch": 1.762665808573778, + "grad_norm": 2.4999592304229736, + "learning_rate": 5.934248632414459e-06, + "loss": 0.03311382532119751, + "step": 49300 + }, + { + "epoch": 1.7662411956094246, + "grad_norm": 0.8577232956886292, + "learning_rate": 5.844863956523293e-06, + "loss": 0.030206308364868165, + "step": 49400 + }, + { + "epoch": 1.7698165826450714, + "grad_norm": 0.90534508228302, + "learning_rate": 5.755479280632128e-06, + "loss": 0.03304917335510254, + "step": 49500 + }, + { + "epoch": 1.7733919696807179, + "grad_norm": 0.4702795445919037, + "learning_rate": 5.666094604740963e-06, + "loss": 0.03289535760879517, + "step": 49600 + }, + { + "epoch": 1.7769673567163644, + "grad_norm": 0.3340344727039337, + "learning_rate": 5.5767099288497984e-06, + "loss": 0.03143750667572021, + "step": 49700 + }, + { + "epoch": 1.7805427437520112, + "grad_norm": 0.8033680319786072, + "learning_rate": 5.4873252529586334e-06, + "loss": 0.03799154043197632, + "step": 49800 + }, + { + "epoch": 1.784118130787658, + "grad_norm": 0.3498431444168091, + "learning_rate": 5.3979405770674684e-06, + "loss": 0.032227945327758786, + "step": 49900 + }, + { + "epoch": 1.7876935178233042, + "grad_norm": 0.5044463276863098, + "learning_rate": 5.308555901176303e-06, + "loss": 0.03224561214447021, + "step": 50000 + }, + { + "epoch": 1.7876935178233042, + "eval_accuracy": 0.9888107687606216, + "eval_f1": 0.9007455797770362, + "eval_loss": 0.04468328878283501, + "eval_precision": 0.890014593623709, + "eval_recall": 0.9117384929419544, + "eval_runtime": 27.446, + "eval_samples_per_second": 819.793, + "eval_steps_per_second": 22.772, + "step": 50000 + }, + { + "epoch": 1.791268904858951, + "grad_norm": 0.2692296504974365, + "learning_rate": 5.219171225285138e-06, + "loss": 0.03332348108291626, + "step": 50100 + }, + { + "epoch": 
1.7948442918945977, + "grad_norm": 0.29106396436691284, + "learning_rate": 5.129786549393973e-06, + "loss": 0.032147047519683836, + "step": 50200 + }, + { + "epoch": 1.7984196789302442, + "grad_norm": 0.20724542438983917, + "learning_rate": 5.040401873502807e-06, + "loss": 0.02886124849319458, + "step": 50300 + }, + { + "epoch": 1.8019950659658908, + "grad_norm": 0.7092130184173584, + "learning_rate": 4.951017197611642e-06, + "loss": 0.033159823417663575, + "step": 50400 + }, + { + "epoch": 1.8055704530015375, + "grad_norm": 0.432674765586853, + "learning_rate": 4.861632521720477e-06, + "loss": 0.03299700260162353, + "step": 50500 + }, + { + "epoch": 1.809145840037184, + "grad_norm": 0.9785314798355103, + "learning_rate": 4.772247845829311e-06, + "loss": 0.03019791841506958, + "step": 50600 + }, + { + "epoch": 1.8127212270728306, + "grad_norm": 0.5002002120018005, + "learning_rate": 4.682863169938146e-06, + "loss": 0.035624983310699465, + "step": 50700 + }, + { + "epoch": 1.8162966141084773, + "grad_norm": 0.765285313129425, + "learning_rate": 4.593478494046981e-06, + "loss": 0.02971407175064087, + "step": 50800 + }, + { + "epoch": 1.8198720011441238, + "grad_norm": 0.534965991973877, + "learning_rate": 4.504093818155815e-06, + "loss": 0.03354018688201904, + "step": 50900 + }, + { + "epoch": 1.8234473881797704, + "grad_norm": 0.7223150134086609, + "learning_rate": 4.41470914226465e-06, + "loss": 0.02953230619430542, + "step": 51000 + }, + { + "epoch": 1.8270227752154171, + "grad_norm": 0.38850611448287964, + "learning_rate": 4.325324466373485e-06, + "loss": 0.030534558296203614, + "step": 51100 + }, + { + "epoch": 1.8305981622510636, + "grad_norm": 0.36119019985198975, + "learning_rate": 4.23593979048232e-06, + "loss": 0.030811927318572997, + "step": 51200 + }, + { + "epoch": 1.8341735492867102, + "grad_norm": 0.4112676978111267, + "learning_rate": 4.146555114591154e-06, + "loss": 0.036168689727783206, + "step": 51300 + }, + { + "epoch": 1.837748936322357, + 
"grad_norm": 0.38200223445892334, + "learning_rate": 4.057170438699989e-06, + "loss": 0.03023934841156006, + "step": 51400 + }, + { + "epoch": 1.8413243233580037, + "grad_norm": 0.22987698018550873, + "learning_rate": 3.967785762808824e-06, + "loss": 0.03280112981796265, + "step": 51500 + }, + { + "epoch": 1.84489971039365, + "grad_norm": 0.5126951336860657, + "learning_rate": 3.8784010869176585e-06, + "loss": 0.032214133739471434, + "step": 51600 + }, + { + "epoch": 1.8484750974292967, + "grad_norm": 0.3394624888896942, + "learning_rate": 3.7890164110264935e-06, + "loss": 0.0288789963722229, + "step": 51700 + }, + { + "epoch": 1.8520504844649435, + "grad_norm": 0.8338372111320496, + "learning_rate": 3.699631735135328e-06, + "loss": 0.03252574443817138, + "step": 51800 + }, + { + "epoch": 1.8556258715005898, + "grad_norm": 0.2515293061733246, + "learning_rate": 3.6102470592441635e-06, + "loss": 0.029772815704345704, + "step": 51900 + }, + { + "epoch": 1.8592012585362365, + "grad_norm": 0.5206916332244873, + "learning_rate": 3.5208623833529985e-06, + "loss": 0.030335335731506347, + "step": 52000 + }, + { + "epoch": 1.8627766455718833, + "grad_norm": 2.3129968643188477, + "learning_rate": 3.431477707461833e-06, + "loss": 0.032417423725128174, + "step": 52100 + }, + { + "epoch": 1.8663520326075298, + "grad_norm": 1.627025842666626, + "learning_rate": 3.342093031570668e-06, + "loss": 0.03170029640197754, + "step": 52200 + }, + { + "epoch": 1.8699274196431763, + "grad_norm": 1.4574371576309204, + "learning_rate": 3.2527083556795027e-06, + "loss": 0.03141381978988647, + "step": 52300 + }, + { + "epoch": 1.873502806678823, + "grad_norm": 0.3863239288330078, + "learning_rate": 3.1633236797883373e-06, + "loss": 0.031075146198272705, + "step": 52400 + }, + { + "epoch": 1.8770781937144696, + "grad_norm": 0.4181801676750183, + "learning_rate": 3.0739390038971723e-06, + "loss": 0.031000993251800536, + "step": 52500 + }, + { + "epoch": 1.8770781937144696, + "eval_accuracy": 
0.9888013010709535, + "eval_f1": 0.9016717087789566, + "eval_loss": 0.04389448091387749, + "eval_precision": 0.8910285200988098, + "eval_recall": 0.9125722335623724, + "eval_runtime": 27.8666, + "eval_samples_per_second": 807.418, + "eval_steps_per_second": 22.428, + "step": 52500 + }, + { + "epoch": 1.8806535807501161, + "grad_norm": 0.2707064151763916, + "learning_rate": 2.984554328006007e-06, + "loss": 0.03185615539550781, + "step": 52600 + }, + { + "epoch": 1.8842289677857629, + "grad_norm": 0.5553069710731506, + "learning_rate": 2.895169652114842e-06, + "loss": 0.03002817392349243, + "step": 52700 + }, + { + "epoch": 1.8878043548214094, + "grad_norm": 0.3491911292076111, + "learning_rate": 2.8057849762236764e-06, + "loss": 0.028789632320404053, + "step": 52800 + }, + { + "epoch": 1.891379741857056, + "grad_norm": 0.25187739729881287, + "learning_rate": 2.716400300332511e-06, + "loss": 0.030605175495147706, + "step": 52900 + }, + { + "epoch": 1.8949551288927027, + "grad_norm": 0.9672222137451172, + "learning_rate": 2.627015624441346e-06, + "loss": 0.026704788208007812, + "step": 53000 + }, + { + "epoch": 1.8985305159283492, + "grad_norm": 0.20565390586853027, + "learning_rate": 2.5376309485501806e-06, + "loss": 0.03059121608734131, + "step": 53100 + }, + { + "epoch": 1.9021059029639957, + "grad_norm": 0.28167805075645447, + "learning_rate": 2.448246272659015e-06, + "loss": 0.03177599668502808, + "step": 53200 + }, + { + "epoch": 1.9056812899996425, + "grad_norm": 0.24386221170425415, + "learning_rate": 2.35886159676785e-06, + "loss": 0.029768753051757812, + "step": 53300 + }, + { + "epoch": 1.9092566770352892, + "grad_norm": 3.4795925617218018, + "learning_rate": 2.2694769208766848e-06, + "loss": 0.030632736682891844, + "step": 53400 + }, + { + "epoch": 1.9128320640709355, + "grad_norm": 0.28710371255874634, + "learning_rate": 2.1800922449855198e-06, + "loss": 0.03532270431518555, + "step": 53500 + }, + { + "epoch": 1.9164074511065823, + "grad_norm": 
1.0009117126464844, + "learning_rate": 2.090707569094355e-06, + "loss": 0.030157883167266846, + "step": 53600 + }, + { + "epoch": 1.919982838142229, + "grad_norm": 0.8986654877662659, + "learning_rate": 2.0013228932031894e-06, + "loss": 0.02968831777572632, + "step": 53700 + }, + { + "epoch": 1.9235582251778756, + "grad_norm": 0.4408089518547058, + "learning_rate": 1.9119382173120244e-06, + "loss": 0.031650230884552, + "step": 53800 + }, + { + "epoch": 1.927133612213522, + "grad_norm": 0.44061407446861267, + "learning_rate": 1.822553541420859e-06, + "loss": 0.03314239501953125, + "step": 53900 + }, + { + "epoch": 1.9307089992491688, + "grad_norm": 0.31529247760772705, + "learning_rate": 1.7331688655296938e-06, + "loss": 0.028174445629119874, + "step": 54000 + }, + { + "epoch": 1.9342843862848154, + "grad_norm": 0.46949172019958496, + "learning_rate": 1.6437841896385283e-06, + "loss": 0.03205679178237915, + "step": 54100 + }, + { + "epoch": 1.9378597733204619, + "grad_norm": 0.42985737323760986, + "learning_rate": 1.5543995137473631e-06, + "loss": 0.03423054218292236, + "step": 54200 + }, + { + "epoch": 1.9414351603561086, + "grad_norm": 0.3582230806350708, + "learning_rate": 1.465014837856198e-06, + "loss": 0.036082537174224855, + "step": 54300 + }, + { + "epoch": 1.9450105473917552, + "grad_norm": 0.2743465304374695, + "learning_rate": 1.375630161965033e-06, + "loss": 0.03133800745010376, + "step": 54400 + }, + { + "epoch": 1.9485859344274017, + "grad_norm": 0.3252977728843689, + "learning_rate": 1.2862454860738675e-06, + "loss": 0.029351208209991455, + "step": 54500 + }, + { + "epoch": 1.9521613214630484, + "grad_norm": 0.7166300415992737, + "learning_rate": 1.1968608101827023e-06, + "loss": 0.03286364078521729, + "step": 54600 + }, + { + "epoch": 1.955736708498695, + "grad_norm": 0.4002815783023834, + "learning_rate": 1.1074761342915371e-06, + "loss": 0.03330163955688477, + "step": 54700 + }, + { + "epoch": 1.9593120955343415, + "grad_norm": 0.6636976003646851, 
+ "learning_rate": 1.018091458400372e-06, + "loss": 0.03203016996383667, + "step": 54800 + }, + { + "epoch": 1.9628874825699882, + "grad_norm": 0.9583289623260498, + "learning_rate": 9.287067825092066e-07, + "loss": 0.03129979610443115, + "step": 54900 + }, + { + "epoch": 1.9664628696056348, + "grad_norm": 0.31978148221969604, + "learning_rate": 8.393221066180415e-07, + "loss": 0.029429452419281008, + "step": 55000 + }, + { + "epoch": 1.9664628696056348, + "eval_accuracy": 0.9891878983990663, + "eval_f1": 0.9045753492836575, + "eval_loss": 0.04267999157309532, + "eval_precision": 0.8949478748997595, + "eval_recall": 0.9144122128626053, + "eval_runtime": 27.5433, + "eval_samples_per_second": 816.897, + "eval_steps_per_second": 22.692, + "step": 55000 + }, + { + "epoch": 1.9700382566412813, + "grad_norm": 2.8054332733154297, + "learning_rate": 7.499374307268763e-07, + "loss": 0.03312858819961548, + "step": 55100 + }, + { + "epoch": 1.973613643676928, + "grad_norm": 0.5224851369857788, + "learning_rate": 6.60552754835711e-07, + "loss": 0.028790268898010254, + "step": 55200 + }, + { + "epoch": 1.9771890307125748, + "grad_norm": 0.26614582538604736, + "learning_rate": 5.711680789445458e-07, + "loss": 0.028711328506469725, + "step": 55300 + }, + { + "epoch": 1.980764417748221, + "grad_norm": 0.7065221667289734, + "learning_rate": 4.817834030533806e-07, + "loss": 0.03409520626068115, + "step": 55400 + }, + { + "epoch": 1.9843398047838678, + "grad_norm": 0.5520646572113037, + "learning_rate": 3.923987271622153e-07, + "loss": 0.030617287158966066, + "step": 55500 + }, + { + "epoch": 1.9879151918195146, + "grad_norm": 0.8152151703834534, + "learning_rate": 3.030140512710501e-07, + "loss": 0.034760825634002686, + "step": 55600 + }, + { + "epoch": 1.9914905788551611, + "grad_norm": 0.7719851136207581, + "learning_rate": 2.136293753798849e-07, + "loss": 0.033238520622253416, + "step": 55700 + }, + { + "epoch": 1.9950659658908076, + "grad_norm": 0.3627885580062866, + 
"learning_rate": 1.2424469948871967e-07, + "loss": 0.029695370197296143, + "step": 55800 + }, + { + "epoch": 1.9986413529264544, + "grad_norm": 1.9493422508239746, + "learning_rate": 3.4860023597554434e-08, + "loss": 0.032883105278015134, + "step": 55900 + }, + { + "epoch": 2.0, + "step": 55938, + "total_flos": 1.889848580814228e+18, + "train_loss": 0.058781605476671404, + "train_runtime": 18337.3113, + "train_samples_per_second": 439.268, + "train_steps_per_second": 3.051 + } + ], + "logging_steps": 100, + "max_steps": 55938, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.889848580814228e+18, + "train_batch_size": 72, + "trial_name": null, + "trial_params": null +}