Image-Text-to-Text
PEFT
Safetensors
conversational
Tridis_HTR_MiniCPM / trainer_state.json
magistermilitum's picture
first load
77e5cef verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.4930747922437675,
"eval_steps": 3001,
"global_step": 18000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013850415512465374,
"grad_norm": 18.132577896118164,
"learning_rate": 2.5688073394495415e-06,
"loss": 3.1562,
"step": 10
},
{
"epoch": 0.002770083102493075,
"grad_norm": 10.339975357055664,
"learning_rate": 6.238532110091744e-06,
"loss": 2.5596,
"step": 20
},
{
"epoch": 0.004155124653739612,
"grad_norm": 8.46363639831543,
"learning_rate": 9.908256880733946e-06,
"loss": 2.3108,
"step": 30
},
{
"epoch": 0.00554016620498615,
"grad_norm": 10.257675170898438,
"learning_rate": 1.3577981651376149e-05,
"loss": 2.1665,
"step": 40
},
{
"epoch": 0.006925207756232687,
"grad_norm": 10.614598274230957,
"learning_rate": 1.724770642201835e-05,
"loss": 1.9422,
"step": 50
},
{
"epoch": 0.008310249307479225,
"grad_norm": 10.544795989990234,
"learning_rate": 2.091743119266055e-05,
"loss": 1.7234,
"step": 60
},
{
"epoch": 0.009695290858725761,
"grad_norm": 9.736770629882812,
"learning_rate": 2.4587155963302752e-05,
"loss": 1.7008,
"step": 70
},
{
"epoch": 0.0110803324099723,
"grad_norm": 8.845556259155273,
"learning_rate": 2.8256880733944954e-05,
"loss": 1.6215,
"step": 80
},
{
"epoch": 0.012465373961218837,
"grad_norm": 12.72878360748291,
"learning_rate": 3.1926605504587156e-05,
"loss": 1.5234,
"step": 90
},
{
"epoch": 0.013850415512465374,
"grad_norm": 9.1851167678833,
"learning_rate": 3.559633027522936e-05,
"loss": 1.5045,
"step": 100
},
{
"epoch": 0.015235457063711912,
"grad_norm": 12.70969009399414,
"learning_rate": 3.926605504587156e-05,
"loss": 1.5202,
"step": 110
},
{
"epoch": 0.01662049861495845,
"grad_norm": 13.66281509399414,
"learning_rate": 3.998515150109044e-05,
"loss": 1.4786,
"step": 120
},
{
"epoch": 0.018005540166204988,
"grad_norm": 13.012584686279297,
"learning_rate": 3.996659087745348e-05,
"loss": 1.4217,
"step": 130
},
{
"epoch": 0.019390581717451522,
"grad_norm": 8.268105506896973,
"learning_rate": 3.994803025381653e-05,
"loss": 1.3395,
"step": 140
},
{
"epoch": 0.02077562326869806,
"grad_norm": 10.73968505859375,
"learning_rate": 3.9929469630179577e-05,
"loss": 1.3918,
"step": 150
},
{
"epoch": 0.0221606648199446,
"grad_norm": 15.68938159942627,
"learning_rate": 3.991090900654263e-05,
"loss": 1.3772,
"step": 160
},
{
"epoch": 0.023545706371191136,
"grad_norm": 8.91905689239502,
"learning_rate": 3.989234838290567e-05,
"loss": 1.3547,
"step": 170
},
{
"epoch": 0.024930747922437674,
"grad_norm": 8.650906562805176,
"learning_rate": 3.9873787759268715e-05,
"loss": 1.2924,
"step": 180
},
{
"epoch": 0.02631578947368421,
"grad_norm": 6.8491291999816895,
"learning_rate": 3.985522713563176e-05,
"loss": 1.2698,
"step": 190
},
{
"epoch": 0.027700831024930747,
"grad_norm": 12.829854011535645,
"learning_rate": 3.983666651199481e-05,
"loss": 1.2712,
"step": 200
},
{
"epoch": 0.029085872576177285,
"grad_norm": 7.840148448944092,
"learning_rate": 3.981810588835785e-05,
"loss": 1.2306,
"step": 210
},
{
"epoch": 0.030470914127423823,
"grad_norm": 9.273849487304688,
"learning_rate": 3.97995452647209e-05,
"loss": 1.1315,
"step": 220
},
{
"epoch": 0.03185595567867036,
"grad_norm": 6.926727771759033,
"learning_rate": 3.978098464108395e-05,
"loss": 1.2234,
"step": 230
},
{
"epoch": 0.0332409972299169,
"grad_norm": 8.453941345214844,
"learning_rate": 3.9762424017446984e-05,
"loss": 1.1558,
"step": 240
},
{
"epoch": 0.03462603878116344,
"grad_norm": 12.281208992004395,
"learning_rate": 3.9743863393810035e-05,
"loss": 1.2207,
"step": 250
},
{
"epoch": 0.036011080332409975,
"grad_norm": 8.216794967651367,
"learning_rate": 3.972530277017308e-05,
"loss": 1.1615,
"step": 260
},
{
"epoch": 0.037396121883656507,
"grad_norm": 8.111953735351562,
"learning_rate": 3.970674214653613e-05,
"loss": 1.0734,
"step": 270
},
{
"epoch": 0.038781163434903045,
"grad_norm": 25.132801055908203,
"learning_rate": 3.968818152289917e-05,
"loss": 1.1208,
"step": 280
},
{
"epoch": 0.04016620498614958,
"grad_norm": 8.570955276489258,
"learning_rate": 3.966962089926222e-05,
"loss": 1.1871,
"step": 290
},
{
"epoch": 0.04155124653739612,
"grad_norm": 6.715633869171143,
"learning_rate": 3.965106027562526e-05,
"loss": 1.0505,
"step": 300
},
{
"epoch": 0.04293628808864266,
"grad_norm": 9.195822715759277,
"learning_rate": 3.963249965198831e-05,
"loss": 1.132,
"step": 310
},
{
"epoch": 0.0443213296398892,
"grad_norm": 13.249361991882324,
"learning_rate": 3.9613939028351355e-05,
"loss": 1.1381,
"step": 320
},
{
"epoch": 0.045706371191135735,
"grad_norm": 10.22148323059082,
"learning_rate": 3.95953784047144e-05,
"loss": 1.03,
"step": 330
},
{
"epoch": 0.04709141274238227,
"grad_norm": 6.477139949798584,
"learning_rate": 3.957681778107745e-05,
"loss": 1.0697,
"step": 340
},
{
"epoch": 0.04847645429362881,
"grad_norm": 7.295064449310303,
"learning_rate": 3.9558257157440493e-05,
"loss": 1.1033,
"step": 350
},
{
"epoch": 0.04986149584487535,
"grad_norm": 7.153372287750244,
"learning_rate": 3.953969653380354e-05,
"loss": 1.0929,
"step": 360
},
{
"epoch": 0.05124653739612189,
"grad_norm": 7.3654608726501465,
"learning_rate": 3.952113591016658e-05,
"loss": 1.0833,
"step": 370
},
{
"epoch": 0.05263157894736842,
"grad_norm": 6.621361255645752,
"learning_rate": 3.950257528652963e-05,
"loss": 1.132,
"step": 380
},
{
"epoch": 0.054016620498614956,
"grad_norm": 8.671921730041504,
"learning_rate": 3.9484014662892675e-05,
"loss": 0.9377,
"step": 390
},
{
"epoch": 0.055401662049861494,
"grad_norm": 5.83281135559082,
"learning_rate": 3.9465454039255726e-05,
"loss": 1.077,
"step": 400
},
{
"epoch": 0.05678670360110803,
"grad_norm": 10.525577545166016,
"learning_rate": 3.944689341561877e-05,
"loss": 1.0157,
"step": 410
},
{
"epoch": 0.05817174515235457,
"grad_norm": 8.713621139526367,
"learning_rate": 3.9428332791981814e-05,
"loss": 0.9943,
"step": 420
},
{
"epoch": 0.05955678670360111,
"grad_norm": 7.146033763885498,
"learning_rate": 3.940977216834486e-05,
"loss": 1.0711,
"step": 430
},
{
"epoch": 0.060941828254847646,
"grad_norm": 5.721404075622559,
"learning_rate": 3.93912115447079e-05,
"loss": 1.0587,
"step": 440
},
{
"epoch": 0.062326869806094184,
"grad_norm": 6.381560802459717,
"learning_rate": 3.937265092107095e-05,
"loss": 0.9532,
"step": 450
},
{
"epoch": 0.06371191135734072,
"grad_norm": 6.270388603210449,
"learning_rate": 3.9354090297433996e-05,
"loss": 0.9633,
"step": 460
},
{
"epoch": 0.06509695290858726,
"grad_norm": 14.950815200805664,
"learning_rate": 3.9335529673797046e-05,
"loss": 1.0124,
"step": 470
},
{
"epoch": 0.0664819944598338,
"grad_norm": 10.147126197814941,
"learning_rate": 3.931696905016009e-05,
"loss": 0.9639,
"step": 480
},
{
"epoch": 0.06786703601108034,
"grad_norm": 6.738188743591309,
"learning_rate": 3.9298408426523134e-05,
"loss": 0.8883,
"step": 490
},
{
"epoch": 0.06925207756232687,
"grad_norm": 6.236662864685059,
"learning_rate": 3.927984780288618e-05,
"loss": 0.978,
"step": 500
},
{
"epoch": 0.07063711911357341,
"grad_norm": 6.446779251098633,
"learning_rate": 3.926128717924923e-05,
"loss": 0.9513,
"step": 510
},
{
"epoch": 0.07202216066481995,
"grad_norm": 5.965839385986328,
"learning_rate": 3.924272655561227e-05,
"loss": 0.9512,
"step": 520
},
{
"epoch": 0.07340720221606649,
"grad_norm": 6.4189863204956055,
"learning_rate": 3.9224165931975316e-05,
"loss": 0.9166,
"step": 530
},
{
"epoch": 0.07479224376731301,
"grad_norm": 6.521060943603516,
"learning_rate": 3.9205605308338366e-05,
"loss": 0.9101,
"step": 540
},
{
"epoch": 0.07617728531855955,
"grad_norm": 6.578867435455322,
"learning_rate": 3.918704468470141e-05,
"loss": 0.89,
"step": 550
},
{
"epoch": 0.07756232686980609,
"grad_norm": 5.1601080894470215,
"learning_rate": 3.9168484061064454e-05,
"loss": 0.9313,
"step": 560
},
{
"epoch": 0.07894736842105263,
"grad_norm": 6.995227336883545,
"learning_rate": 3.91499234374275e-05,
"loss": 0.8137,
"step": 570
},
{
"epoch": 0.08033240997229917,
"grad_norm": 8.542673110961914,
"learning_rate": 3.913136281379055e-05,
"loss": 0.843,
"step": 580
},
{
"epoch": 0.0817174515235457,
"grad_norm": 7.692107677459717,
"learning_rate": 3.911280219015359e-05,
"loss": 0.8752,
"step": 590
},
{
"epoch": 0.08310249307479224,
"grad_norm": 7.9079060554504395,
"learning_rate": 3.909424156651664e-05,
"loss": 0.9327,
"step": 600
},
{
"epoch": 0.08448753462603878,
"grad_norm": 5.869115829467773,
"learning_rate": 3.907568094287969e-05,
"loss": 0.878,
"step": 610
},
{
"epoch": 0.08587257617728532,
"grad_norm": 8.590213775634766,
"learning_rate": 3.905712031924273e-05,
"loss": 0.8299,
"step": 620
},
{
"epoch": 0.08725761772853186,
"grad_norm": 5.390839099884033,
"learning_rate": 3.9038559695605774e-05,
"loss": 0.8466,
"step": 630
},
{
"epoch": 0.0886426592797784,
"grad_norm": 9.3699312210083,
"learning_rate": 3.901999907196882e-05,
"loss": 0.8318,
"step": 640
},
{
"epoch": 0.09002770083102493,
"grad_norm": 4.842573642730713,
"learning_rate": 3.900143844833187e-05,
"loss": 0.8058,
"step": 650
},
{
"epoch": 0.09141274238227147,
"grad_norm": 6.638500690460205,
"learning_rate": 3.898287782469491e-05,
"loss": 0.925,
"step": 660
},
{
"epoch": 0.09279778393351801,
"grad_norm": 5.404094219207764,
"learning_rate": 3.8964317201057956e-05,
"loss": 0.9217,
"step": 670
},
{
"epoch": 0.09418282548476455,
"grad_norm": 6.114800453186035,
"learning_rate": 3.8945756577421e-05,
"loss": 0.8202,
"step": 680
},
{
"epoch": 0.09556786703601108,
"grad_norm": 9.157390594482422,
"learning_rate": 3.892719595378405e-05,
"loss": 0.8351,
"step": 690
},
{
"epoch": 0.09695290858725762,
"grad_norm": 4.4271697998046875,
"learning_rate": 3.8908635330147095e-05,
"loss": 0.8524,
"step": 700
},
{
"epoch": 0.09833795013850416,
"grad_norm": 6.291593551635742,
"learning_rate": 3.8890074706510145e-05,
"loss": 0.7766,
"step": 710
},
{
"epoch": 0.0997229916897507,
"grad_norm": 5.622344017028809,
"learning_rate": 3.887151408287319e-05,
"loss": 0.8793,
"step": 720
},
{
"epoch": 0.10110803324099724,
"grad_norm": 5.384952545166016,
"learning_rate": 3.885295345923623e-05,
"loss": 0.8128,
"step": 730
},
{
"epoch": 0.10249307479224377,
"grad_norm": 5.869991302490234,
"learning_rate": 3.8834392835599277e-05,
"loss": 0.7979,
"step": 740
},
{
"epoch": 0.1038781163434903,
"grad_norm": 15.408794403076172,
"learning_rate": 3.881583221196232e-05,
"loss": 0.8147,
"step": 750
},
{
"epoch": 0.10526315789473684,
"grad_norm": 3.851168394088745,
"learning_rate": 3.879727158832537e-05,
"loss": 0.8531,
"step": 760
},
{
"epoch": 0.10664819944598337,
"grad_norm": 7.8020124435424805,
"learning_rate": 3.8778710964688415e-05,
"loss": 0.762,
"step": 770
},
{
"epoch": 0.10803324099722991,
"grad_norm": 6.013789653778076,
"learning_rate": 3.8760150341051465e-05,
"loss": 0.9078,
"step": 780
},
{
"epoch": 0.10941828254847645,
"grad_norm": 10.16032600402832,
"learning_rate": 3.874158971741451e-05,
"loss": 0.7734,
"step": 790
},
{
"epoch": 0.11080332409972299,
"grad_norm": 5.860641956329346,
"learning_rate": 3.872302909377755e-05,
"loss": 0.7048,
"step": 800
},
{
"epoch": 0.11218836565096953,
"grad_norm": 5.945785045623779,
"learning_rate": 3.87044684701406e-05,
"loss": 0.7713,
"step": 810
},
{
"epoch": 0.11357340720221606,
"grad_norm": 7.668937683105469,
"learning_rate": 3.868590784650365e-05,
"loss": 0.8446,
"step": 820
},
{
"epoch": 0.1149584487534626,
"grad_norm": 4.74345064163208,
"learning_rate": 3.866734722286669e-05,
"loss": 0.7783,
"step": 830
},
{
"epoch": 0.11634349030470914,
"grad_norm": 6.158203601837158,
"learning_rate": 3.8648786599229735e-05,
"loss": 0.7757,
"step": 840
},
{
"epoch": 0.11772853185595568,
"grad_norm": 6.251650333404541,
"learning_rate": 3.8630225975592786e-05,
"loss": 0.7444,
"step": 850
},
{
"epoch": 0.11911357340720222,
"grad_norm": 6.265425682067871,
"learning_rate": 3.861166535195583e-05,
"loss": 0.7332,
"step": 860
},
{
"epoch": 0.12049861495844875,
"grad_norm": 6.330787658691406,
"learning_rate": 3.859310472831887e-05,
"loss": 0.8099,
"step": 870
},
{
"epoch": 0.12188365650969529,
"grad_norm": 5.528647422790527,
"learning_rate": 3.857454410468192e-05,
"loss": 0.7554,
"step": 880
},
{
"epoch": 0.12326869806094183,
"grad_norm": 4.549752712249756,
"learning_rate": 3.855598348104497e-05,
"loss": 0.8289,
"step": 890
},
{
"epoch": 0.12465373961218837,
"grad_norm": 6.695212364196777,
"learning_rate": 3.853742285740801e-05,
"loss": 0.8219,
"step": 900
},
{
"epoch": 0.1260387811634349,
"grad_norm": 8.377049446105957,
"learning_rate": 3.851886223377106e-05,
"loss": 0.7509,
"step": 910
},
{
"epoch": 0.12742382271468145,
"grad_norm": 3.6747989654541016,
"learning_rate": 3.8500301610134106e-05,
"loss": 0.7237,
"step": 920
},
{
"epoch": 0.12880886426592797,
"grad_norm": 5.396711826324463,
"learning_rate": 3.848174098649715e-05,
"loss": 0.7875,
"step": 930
},
{
"epoch": 0.13019390581717452,
"grad_norm": 5.941190719604492,
"learning_rate": 3.8463180362860193e-05,
"loss": 0.8429,
"step": 940
},
{
"epoch": 0.13157894736842105,
"grad_norm": 6.800902843475342,
"learning_rate": 3.844461973922324e-05,
"loss": 0.722,
"step": 950
},
{
"epoch": 0.1329639889196676,
"grad_norm": 5.411956787109375,
"learning_rate": 3.842605911558629e-05,
"loss": 0.6992,
"step": 960
},
{
"epoch": 0.13434903047091412,
"grad_norm": 5.536626815795898,
"learning_rate": 3.840749849194933e-05,
"loss": 0.7314,
"step": 970
},
{
"epoch": 0.13573407202216067,
"grad_norm": 6.304145812988281,
"learning_rate": 3.838893786831238e-05,
"loss": 0.7551,
"step": 980
},
{
"epoch": 0.1371191135734072,
"grad_norm": 5.245241641998291,
"learning_rate": 3.8370377244675426e-05,
"loss": 0.7268,
"step": 990
},
{
"epoch": 0.13850415512465375,
"grad_norm": 4.621682643890381,
"learning_rate": 3.835181662103847e-05,
"loss": 0.7104,
"step": 1000
},
{
"epoch": 0.13988919667590027,
"grad_norm": 4.966935634613037,
"learning_rate": 3.8333255997401514e-05,
"loss": 0.7683,
"step": 1010
},
{
"epoch": 0.14127423822714683,
"grad_norm": 5.239284992218018,
"learning_rate": 3.8314695373764564e-05,
"loss": 0.692,
"step": 1020
},
{
"epoch": 0.14265927977839335,
"grad_norm": 5.201038360595703,
"learning_rate": 3.829613475012761e-05,
"loss": 0.7262,
"step": 1030
},
{
"epoch": 0.1440443213296399,
"grad_norm": 5.287726879119873,
"learning_rate": 3.827757412649065e-05,
"loss": 0.7775,
"step": 1040
},
{
"epoch": 0.14542936288088643,
"grad_norm": 5.782080173492432,
"learning_rate": 3.8259013502853696e-05,
"loss": 0.68,
"step": 1050
},
{
"epoch": 0.14681440443213298,
"grad_norm": 4.6837239265441895,
"learning_rate": 3.824045287921674e-05,
"loss": 0.6769,
"step": 1060
},
{
"epoch": 0.1481994459833795,
"grad_norm": 4.747815132141113,
"learning_rate": 3.822189225557979e-05,
"loss": 0.7367,
"step": 1070
},
{
"epoch": 0.14958448753462603,
"grad_norm": 6.258734226226807,
"learning_rate": 3.8203331631942834e-05,
"loss": 0.7328,
"step": 1080
},
{
"epoch": 0.15096952908587258,
"grad_norm": 6.083340644836426,
"learning_rate": 3.8184771008305885e-05,
"loss": 0.7175,
"step": 1090
},
{
"epoch": 0.1523545706371191,
"grad_norm": 5.685976028442383,
"learning_rate": 3.816621038466893e-05,
"loss": 0.7737,
"step": 1100
},
{
"epoch": 0.15373961218836565,
"grad_norm": 5.695319175720215,
"learning_rate": 3.814764976103197e-05,
"loss": 0.6561,
"step": 1110
},
{
"epoch": 0.15512465373961218,
"grad_norm": 4.47851037979126,
"learning_rate": 3.8129089137395016e-05,
"loss": 0.7342,
"step": 1120
},
{
"epoch": 0.15650969529085873,
"grad_norm": 5.210443019866943,
"learning_rate": 3.8110528513758067e-05,
"loss": 0.761,
"step": 1130
},
{
"epoch": 0.15789473684210525,
"grad_norm": 4.6967549324035645,
"learning_rate": 3.809196789012111e-05,
"loss": 0.7034,
"step": 1140
},
{
"epoch": 0.1592797783933518,
"grad_norm": 5.27488899230957,
"learning_rate": 3.807340726648416e-05,
"loss": 0.7862,
"step": 1150
},
{
"epoch": 0.16066481994459833,
"grad_norm": 6.890642166137695,
"learning_rate": 3.8054846642847205e-05,
"loss": 0.6824,
"step": 1160
},
{
"epoch": 0.16204986149584488,
"grad_norm": 4.336601734161377,
"learning_rate": 3.803628601921025e-05,
"loss": 0.6637,
"step": 1170
},
{
"epoch": 0.1634349030470914,
"grad_norm": 7.999341011047363,
"learning_rate": 3.801772539557329e-05,
"loss": 0.6404,
"step": 1180
},
{
"epoch": 0.16481994459833796,
"grad_norm": 5.302224636077881,
"learning_rate": 3.7999164771936336e-05,
"loss": 0.7179,
"step": 1190
},
{
"epoch": 0.16620498614958448,
"grad_norm": 4.466766834259033,
"learning_rate": 3.798060414829939e-05,
"loss": 0.7234,
"step": 1200
},
{
"epoch": 0.16759002770083103,
"grad_norm": 7.038808345794678,
"learning_rate": 3.796204352466243e-05,
"loss": 0.6967,
"step": 1210
},
{
"epoch": 0.16897506925207756,
"grad_norm": 5.036026477813721,
"learning_rate": 3.794348290102548e-05,
"loss": 0.6566,
"step": 1220
},
{
"epoch": 0.1703601108033241,
"grad_norm": 3.9233360290527344,
"learning_rate": 3.7924922277388525e-05,
"loss": 0.6905,
"step": 1230
},
{
"epoch": 0.17174515235457063,
"grad_norm": 4.825267791748047,
"learning_rate": 3.790636165375157e-05,
"loss": 0.7097,
"step": 1240
},
{
"epoch": 0.1731301939058172,
"grad_norm": 5.921151638031006,
"learning_rate": 3.788780103011461e-05,
"loss": 0.7085,
"step": 1250
},
{
"epoch": 0.1745152354570637,
"grad_norm": 6.008309841156006,
"learning_rate": 3.7869240406477656e-05,
"loss": 0.7447,
"step": 1260
},
{
"epoch": 0.17590027700831026,
"grad_norm": 9.49063491821289,
"learning_rate": 3.785067978284071e-05,
"loss": 0.5783,
"step": 1270
},
{
"epoch": 0.1772853185595568,
"grad_norm": 8.459501266479492,
"learning_rate": 3.783211915920375e-05,
"loss": 0.6649,
"step": 1280
},
{
"epoch": 0.1786703601108033,
"grad_norm": 4.825805187225342,
"learning_rate": 3.78135585355668e-05,
"loss": 0.7529,
"step": 1290
},
{
"epoch": 0.18005540166204986,
"grad_norm": 5.760547161102295,
"learning_rate": 3.7794997911929845e-05,
"loss": 0.7133,
"step": 1300
},
{
"epoch": 0.1814404432132964,
"grad_norm": 4.8905158042907715,
"learning_rate": 3.777643728829289e-05,
"loss": 0.6426,
"step": 1310
},
{
"epoch": 0.18282548476454294,
"grad_norm": 3.2545783519744873,
"learning_rate": 3.775787666465593e-05,
"loss": 0.6756,
"step": 1320
},
{
"epoch": 0.18421052631578946,
"grad_norm": 5.443678855895996,
"learning_rate": 3.7739316041018983e-05,
"loss": 0.7079,
"step": 1330
},
{
"epoch": 0.18559556786703602,
"grad_norm": 4.882148742675781,
"learning_rate": 3.772075541738203e-05,
"loss": 0.6906,
"step": 1340
},
{
"epoch": 0.18698060941828254,
"grad_norm": 4.647951126098633,
"learning_rate": 3.770219479374508e-05,
"loss": 0.7146,
"step": 1350
},
{
"epoch": 0.1883656509695291,
"grad_norm": 4.474496841430664,
"learning_rate": 3.768363417010812e-05,
"loss": 0.6606,
"step": 1360
},
{
"epoch": 0.18975069252077562,
"grad_norm": 4.751135349273682,
"learning_rate": 3.7665073546471165e-05,
"loss": 0.6824,
"step": 1370
},
{
"epoch": 0.19113573407202217,
"grad_norm": 7.56292724609375,
"learning_rate": 3.764651292283421e-05,
"loss": 0.6646,
"step": 1380
},
{
"epoch": 0.1925207756232687,
"grad_norm": 3.877157688140869,
"learning_rate": 3.762795229919725e-05,
"loss": 0.6486,
"step": 1390
},
{
"epoch": 0.19390581717451524,
"grad_norm": 3.8777120113372803,
"learning_rate": 3.7609391675560304e-05,
"loss": 0.6452,
"step": 1400
},
{
"epoch": 0.19529085872576177,
"grad_norm": 11.175263404846191,
"learning_rate": 3.759083105192335e-05,
"loss": 0.6786,
"step": 1410
},
{
"epoch": 0.19667590027700832,
"grad_norm": 4.816016674041748,
"learning_rate": 3.75722704282864e-05,
"loss": 0.6549,
"step": 1420
},
{
"epoch": 0.19806094182825484,
"grad_norm": 4.739681720733643,
"learning_rate": 3.755370980464944e-05,
"loss": 0.7098,
"step": 1430
},
{
"epoch": 0.1994459833795014,
"grad_norm": 3.974384307861328,
"learning_rate": 3.7535149181012486e-05,
"loss": 0.6228,
"step": 1440
},
{
"epoch": 0.20083102493074792,
"grad_norm": 5.216143608093262,
"learning_rate": 3.751658855737553e-05,
"loss": 0.6322,
"step": 1450
},
{
"epoch": 0.20221606648199447,
"grad_norm": 4.246068954467773,
"learning_rate": 3.749802793373858e-05,
"loss": 0.6919,
"step": 1460
},
{
"epoch": 0.203601108033241,
"grad_norm": 5.880276203155518,
"learning_rate": 3.7479467310101624e-05,
"loss": 0.6664,
"step": 1470
},
{
"epoch": 0.20498614958448755,
"grad_norm": 4.310441493988037,
"learning_rate": 3.746090668646467e-05,
"loss": 0.666,
"step": 1480
},
{
"epoch": 0.20637119113573407,
"grad_norm": 4.8572893142700195,
"learning_rate": 3.744234606282771e-05,
"loss": 0.5718,
"step": 1490
},
{
"epoch": 0.2077562326869806,
"grad_norm": 4.827873229980469,
"learning_rate": 3.7423785439190755e-05,
"loss": 0.643,
"step": 1500
},
{
"epoch": 0.20914127423822715,
"grad_norm": 4.610620021820068,
"learning_rate": 3.7405224815553806e-05,
"loss": 0.6318,
"step": 1510
},
{
"epoch": 0.21052631578947367,
"grad_norm": 3.8572442531585693,
"learning_rate": 3.738666419191685e-05,
"loss": 0.6384,
"step": 1520
},
{
"epoch": 0.21191135734072022,
"grad_norm": 5.45397424697876,
"learning_rate": 3.73681035682799e-05,
"loss": 0.6112,
"step": 1530
},
{
"epoch": 0.21329639889196675,
"grad_norm": 4.176355361938477,
"learning_rate": 3.7349542944642944e-05,
"loss": 0.6084,
"step": 1540
},
{
"epoch": 0.2146814404432133,
"grad_norm": 4.451131343841553,
"learning_rate": 3.733098232100599e-05,
"loss": 0.6443,
"step": 1550
},
{
"epoch": 0.21606648199445982,
"grad_norm": 4.111794471740723,
"learning_rate": 3.731242169736903e-05,
"loss": 0.5832,
"step": 1560
},
{
"epoch": 0.21745152354570638,
"grad_norm": 5.375326156616211,
"learning_rate": 3.7293861073732076e-05,
"loss": 0.6261,
"step": 1570
},
{
"epoch": 0.2188365650969529,
"grad_norm": 3.982163429260254,
"learning_rate": 3.7275300450095126e-05,
"loss": 0.6045,
"step": 1580
},
{
"epoch": 0.22022160664819945,
"grad_norm": 4.093824863433838,
"learning_rate": 3.725859588882187e-05,
"loss": 0.6789,
"step": 1590
},
{
"epoch": 0.22160664819944598,
"grad_norm": 4.912144660949707,
"learning_rate": 3.724003526518491e-05,
"loss": 0.6261,
"step": 1600
},
{
"epoch": 0.22299168975069253,
"grad_norm": 7.267054080963135,
"learning_rate": 3.7221474641547957e-05,
"loss": 0.6439,
"step": 1610
},
{
"epoch": 0.22437673130193905,
"grad_norm": 7.550549507141113,
"learning_rate": 3.7202914017911e-05,
"loss": 0.6169,
"step": 1620
},
{
"epoch": 0.2257617728531856,
"grad_norm": 5.615678310394287,
"learning_rate": 3.718435339427405e-05,
"loss": 0.6302,
"step": 1630
},
{
"epoch": 0.22714681440443213,
"grad_norm": 5.005547046661377,
"learning_rate": 3.7165792770637095e-05,
"loss": 0.6026,
"step": 1640
},
{
"epoch": 0.22853185595567868,
"grad_norm": 4.575570106506348,
"learning_rate": 3.7147232147000145e-05,
"loss": 0.5985,
"step": 1650
},
{
"epoch": 0.2299168975069252,
"grad_norm": 4.003867149353027,
"learning_rate": 3.712867152336319e-05,
"loss": 0.6355,
"step": 1660
},
{
"epoch": 0.23130193905817176,
"grad_norm": 4.371513843536377,
"learning_rate": 3.711011089972623e-05,
"loss": 0.6858,
"step": 1670
},
{
"epoch": 0.23268698060941828,
"grad_norm": 4.9826765060424805,
"learning_rate": 3.709155027608928e-05,
"loss": 0.5826,
"step": 1680
},
{
"epoch": 0.23407202216066483,
"grad_norm": 5.753824710845947,
"learning_rate": 3.707298965245233e-05,
"loss": 0.6877,
"step": 1690
},
{
"epoch": 0.23545706371191136,
"grad_norm": 5.006950378417969,
"learning_rate": 3.705442902881537e-05,
"loss": 0.6843,
"step": 1700
},
{
"epoch": 0.23684210526315788,
"grad_norm": 4.370176792144775,
"learning_rate": 3.703586840517842e-05,
"loss": 0.6099,
"step": 1710
},
{
"epoch": 0.23822714681440443,
"grad_norm": 3.8350753784179688,
"learning_rate": 3.7017307781541466e-05,
"loss": 0.6045,
"step": 1720
},
{
"epoch": 0.23961218836565096,
"grad_norm": 4.00664758682251,
"learning_rate": 3.699874715790451e-05,
"loss": 0.6468,
"step": 1730
},
{
"epoch": 0.2409972299168975,
"grad_norm": 4.4113030433654785,
"learning_rate": 3.698018653426755e-05,
"loss": 0.6667,
"step": 1740
},
{
"epoch": 0.24238227146814403,
"grad_norm": 6.2188639640808105,
"learning_rate": 3.69616259106306e-05,
"loss": 0.5793,
"step": 1750
},
{
"epoch": 0.24376731301939059,
"grad_norm": 5.410155773162842,
"learning_rate": 3.694306528699365e-05,
"loss": 0.5879,
"step": 1760
},
{
"epoch": 0.2451523545706371,
"grad_norm": 4.4489521980285645,
"learning_rate": 3.692450466335669e-05,
"loss": 0.6499,
"step": 1770
},
{
"epoch": 0.24653739612188366,
"grad_norm": 5.712794303894043,
"learning_rate": 3.690594403971974e-05,
"loss": 0.6009,
"step": 1780
},
{
"epoch": 0.24792243767313019,
"grad_norm": 4.065003395080566,
"learning_rate": 3.6887383416082786e-05,
"loss": 0.6676,
"step": 1790
},
{
"epoch": 0.24930747922437674,
"grad_norm": 4.225168228149414,
"learning_rate": 3.686882279244583e-05,
"loss": 0.6011,
"step": 1800
},
{
"epoch": 0.25069252077562326,
"grad_norm": 3.180039167404175,
"learning_rate": 3.6850262168808873e-05,
"loss": 0.5549,
"step": 1810
},
{
"epoch": 0.2520775623268698,
"grad_norm": 3.799586772918701,
"learning_rate": 3.6831701545171924e-05,
"loss": 0.6039,
"step": 1820
},
{
"epoch": 0.25346260387811637,
"grad_norm": 4.4404425621032715,
"learning_rate": 3.681314092153497e-05,
"loss": 0.5592,
"step": 1830
},
{
"epoch": 0.2548476454293629,
"grad_norm": 5.383872985839844,
"learning_rate": 3.679458029789801e-05,
"loss": 0.6932,
"step": 1840
},
{
"epoch": 0.2562326869806094,
"grad_norm": 4.271496295928955,
"learning_rate": 3.6776019674261055e-05,
"loss": 0.5506,
"step": 1850
},
{
"epoch": 0.25761772853185594,
"grad_norm": 4.5665202140808105,
"learning_rate": 3.67574590506241e-05,
"loss": 0.6214,
"step": 1860
},
{
"epoch": 0.2590027700831025,
"grad_norm": 5.333662986755371,
"learning_rate": 3.673889842698715e-05,
"loss": 0.6171,
"step": 1870
},
{
"epoch": 0.26038781163434904,
"grad_norm": 3.2647597789764404,
"learning_rate": 3.6720337803350194e-05,
"loss": 0.5626,
"step": 1880
},
{
"epoch": 0.26177285318559557,
"grad_norm": 11.73008918762207,
"learning_rate": 3.6701777179713244e-05,
"loss": 0.5816,
"step": 1890
},
{
"epoch": 0.2631578947368421,
"grad_norm": 4.770632266998291,
"learning_rate": 3.668321655607629e-05,
"loss": 0.5818,
"step": 1900
},
{
"epoch": 0.26454293628808867,
"grad_norm": 3.688140869140625,
"learning_rate": 3.666465593243933e-05,
"loss": 0.5898,
"step": 1910
},
{
"epoch": 0.2659279778393352,
"grad_norm": 6.2547736167907715,
"learning_rate": 3.6646095308802376e-05,
"loss": 0.6211,
"step": 1920
},
{
"epoch": 0.2673130193905817,
"grad_norm": 5.1031389236450195,
"learning_rate": 3.6627534685165426e-05,
"loss": 0.6125,
"step": 1930
},
{
"epoch": 0.26869806094182824,
"grad_norm": 4.455575942993164,
"learning_rate": 3.660897406152847e-05,
"loss": 0.5901,
"step": 1940
},
{
"epoch": 0.27008310249307477,
"grad_norm": 5.132863998413086,
"learning_rate": 3.6590413437891514e-05,
"loss": 0.582,
"step": 1950
},
{
"epoch": 0.27146814404432135,
"grad_norm": 3.6740236282348633,
"learning_rate": 3.6571852814254565e-05,
"loss": 0.5668,
"step": 1960
},
{
"epoch": 0.27285318559556787,
"grad_norm": 4.9556989669799805,
"learning_rate": 3.655329219061761e-05,
"loss": 0.497,
"step": 1970
},
{
"epoch": 0.2742382271468144,
"grad_norm": 5.816377639770508,
"learning_rate": 3.653473156698065e-05,
"loss": 0.5759,
"step": 1980
},
{
"epoch": 0.2756232686980609,
"grad_norm": 11.042893409729004,
"learning_rate": 3.6516170943343696e-05,
"loss": 0.5747,
"step": 1990
},
{
"epoch": 0.2770083102493075,
"grad_norm": 5.5561699867248535,
"learning_rate": 3.6497610319706747e-05,
"loss": 0.6001,
"step": 2000
},
{
"epoch": 0.278393351800554,
"grad_norm": 3.443624496459961,
"learning_rate": 3.647904969606979e-05,
"loss": 0.5558,
"step": 2010
},
{
"epoch": 0.27977839335180055,
"grad_norm": 3.77988862991333,
"learning_rate": 3.646048907243284e-05,
"loss": 0.4779,
"step": 2020
},
{
"epoch": 0.28116343490304707,
"grad_norm": 4.237990379333496,
"learning_rate": 3.6441928448795885e-05,
"loss": 0.5643,
"step": 2030
},
{
"epoch": 0.28254847645429365,
"grad_norm": 5.618165016174316,
"learning_rate": 3.642336782515893e-05,
"loss": 0.6431,
"step": 2040
},
{
"epoch": 0.2839335180055402,
"grad_norm": 4.02553129196167,
"learning_rate": 3.640480720152197e-05,
"loss": 0.5379,
"step": 2050
},
{
"epoch": 0.2853185595567867,
"grad_norm": 4.569252967834473,
"learning_rate": 3.6386246577885016e-05,
"loss": 0.5608,
"step": 2060
},
{
"epoch": 0.2867036011080332,
"grad_norm": 4.142032146453857,
"learning_rate": 3.636768595424807e-05,
"loss": 0.572,
"step": 2070
},
{
"epoch": 0.2880886426592798,
"grad_norm": 6.030134677886963,
"learning_rate": 3.634912533061111e-05,
"loss": 0.6114,
"step": 2080
},
{
"epoch": 0.2894736842105263,
"grad_norm": 5.247105598449707,
"learning_rate": 3.633056470697416e-05,
"loss": 0.5943,
"step": 2090
},
{
"epoch": 0.29085872576177285,
"grad_norm": 3.4325268268585205,
"learning_rate": 3.6312004083337205e-05,
"loss": 0.5745,
"step": 2100
},
{
"epoch": 0.2922437673130194,
"grad_norm": 3.9712328910827637,
"learning_rate": 3.629344345970025e-05,
"loss": 0.5947,
"step": 2110
},
{
"epoch": 0.29362880886426596,
"grad_norm": 4.028194427490234,
"learning_rate": 3.627488283606329e-05,
"loss": 0.5505,
"step": 2120
},
{
"epoch": 0.2950138504155125,
"grad_norm": 3.4895143508911133,
"learning_rate": 3.625632221242634e-05,
"loss": 0.5593,
"step": 2130
},
{
"epoch": 0.296398891966759,
"grad_norm": 3.2017526626586914,
"learning_rate": 3.623776158878939e-05,
"loss": 0.5276,
"step": 2140
},
{
"epoch": 0.29778393351800553,
"grad_norm": 3.9487078189849854,
"learning_rate": 3.621920096515243e-05,
"loss": 0.5495,
"step": 2150
},
{
"epoch": 0.29916897506925205,
"grad_norm": 4.558566570281982,
"learning_rate": 3.620064034151548e-05,
"loss": 0.6063,
"step": 2160
},
{
"epoch": 0.30055401662049863,
"grad_norm": 5.271570682525635,
"learning_rate": 3.6182079717878525e-05,
"loss": 0.5808,
"step": 2170
},
{
"epoch": 0.30193905817174516,
"grad_norm": 4.145854949951172,
"learning_rate": 3.616351909424157e-05,
"loss": 0.5319,
"step": 2180
},
{
"epoch": 0.3033240997229917,
"grad_norm": 5.569437503814697,
"learning_rate": 3.614495847060461e-05,
"loss": 0.5293,
"step": 2190
},
{
"epoch": 0.3047091412742382,
"grad_norm": 5.583808422088623,
"learning_rate": 3.6126397846967663e-05,
"loss": 0.5285,
"step": 2200
},
{
"epoch": 0.3060941828254848,
"grad_norm": 5.36161470413208,
"learning_rate": 3.610783722333071e-05,
"loss": 0.5507,
"step": 2210
},
{
"epoch": 0.3074792243767313,
"grad_norm": 4.53624963760376,
"learning_rate": 3.608927659969375e-05,
"loss": 0.529,
"step": 2220
},
{
"epoch": 0.30886426592797783,
"grad_norm": 5.9227705001831055,
"learning_rate": 3.6070715976056795e-05,
"loss": 0.4959,
"step": 2230
},
{
"epoch": 0.31024930747922436,
"grad_norm": 3.500408887863159,
"learning_rate": 3.6052155352419845e-05,
"loss": 0.5269,
"step": 2240
},
{
"epoch": 0.31163434903047094,
"grad_norm": 3.2887611389160156,
"learning_rate": 3.603359472878289e-05,
"loss": 0.5743,
"step": 2250
},
{
"epoch": 0.31301939058171746,
"grad_norm": 4.520106792449951,
"learning_rate": 3.601503410514593e-05,
"loss": 0.5698,
"step": 2260
},
{
"epoch": 0.314404432132964,
"grad_norm": 4.752847671508789,
"learning_rate": 3.5996473481508984e-05,
"loss": 0.6082,
"step": 2270
},
{
"epoch": 0.3157894736842105,
"grad_norm": 3.0846593379974365,
"learning_rate": 3.597791285787203e-05,
"loss": 0.5231,
"step": 2280
},
{
"epoch": 0.3171745152354571,
"grad_norm": 3.945099115371704,
"learning_rate": 3.595935223423507e-05,
"loss": 0.552,
"step": 2290
},
{
"epoch": 0.3185595567867036,
"grad_norm": 3.7080931663513184,
"learning_rate": 3.5940791610598115e-05,
"loss": 0.5954,
"step": 2300
},
{
"epoch": 0.31994459833795014,
"grad_norm": 3.6161038875579834,
"learning_rate": 3.5922230986961166e-05,
"loss": 0.5742,
"step": 2310
},
{
"epoch": 0.32132963988919666,
"grad_norm": 3.662464141845703,
"learning_rate": 3.590367036332421e-05,
"loss": 0.5134,
"step": 2320
},
{
"epoch": 0.32271468144044324,
"grad_norm": 5.78975248336792,
"learning_rate": 3.588510973968726e-05,
"loss": 0.6286,
"step": 2330
},
{
"epoch": 0.32409972299168976,
"grad_norm": 4.353717803955078,
"learning_rate": 3.5866549116050304e-05,
"loss": 0.5182,
"step": 2340
},
{
"epoch": 0.3254847645429363,
"grad_norm": 3.810673952102661,
"learning_rate": 3.584798849241335e-05,
"loss": 0.4705,
"step": 2350
},
{
"epoch": 0.3268698060941828,
"grad_norm": 3.3044183254241943,
"learning_rate": 3.582942786877639e-05,
"loss": 0.5736,
"step": 2360
},
{
"epoch": 0.32825484764542934,
"grad_norm": 5.248436450958252,
"learning_rate": 3.5810867245139435e-05,
"loss": 0.5238,
"step": 2370
},
{
"epoch": 0.3296398891966759,
"grad_norm": 4.443650245666504,
"learning_rate": 3.5792306621502486e-05,
"loss": 0.5366,
"step": 2380
},
{
"epoch": 0.33102493074792244,
"grad_norm": 4.204538345336914,
"learning_rate": 3.577374599786553e-05,
"loss": 0.4978,
"step": 2390
},
{
"epoch": 0.33240997229916897,
"grad_norm": 3.603652000427246,
"learning_rate": 3.575518537422858e-05,
"loss": 0.5319,
"step": 2400
},
{
"epoch": 0.3337950138504155,
"grad_norm": 4.066627025604248,
"learning_rate": 3.5736624750591624e-05,
"loss": 0.4795,
"step": 2410
},
{
"epoch": 0.33518005540166207,
"grad_norm": 7.520932197570801,
"learning_rate": 3.571806412695467e-05,
"loss": 0.4946,
"step": 2420
},
{
"epoch": 0.3365650969529086,
"grad_norm": 4.000723361968994,
"learning_rate": 3.569950350331771e-05,
"loss": 0.5143,
"step": 2430
},
{
"epoch": 0.3379501385041551,
"grad_norm": 3.362517833709717,
"learning_rate": 3.568094287968076e-05,
"loss": 0.5311,
"step": 2440
},
{
"epoch": 0.33933518005540164,
"grad_norm": 4.851598262786865,
"learning_rate": 3.5662382256043806e-05,
"loss": 0.503,
"step": 2450
},
{
"epoch": 0.3407202216066482,
"grad_norm": 2.6380112171173096,
"learning_rate": 3.564382163240685e-05,
"loss": 0.5267,
"step": 2460
},
{
"epoch": 0.34210526315789475,
"grad_norm": 4.395262241363525,
"learning_rate": 3.56252610087699e-05,
"loss": 0.5091,
"step": 2470
},
{
"epoch": 0.34349030470914127,
"grad_norm": 5.229985237121582,
"learning_rate": 3.5606700385132944e-05,
"loss": 0.5397,
"step": 2480
},
{
"epoch": 0.3448753462603878,
"grad_norm": 3.8562421798706055,
"learning_rate": 3.558813976149599e-05,
"loss": 0.5188,
"step": 2490
},
{
"epoch": 0.3462603878116344,
"grad_norm": 11.592336654663086,
"learning_rate": 3.556957913785903e-05,
"loss": 0.5095,
"step": 2500
},
{
"epoch": 0.3476454293628809,
"grad_norm": 3.571937322616577,
"learning_rate": 3.555101851422208e-05,
"loss": 0.6358,
"step": 2510
},
{
"epoch": 0.3490304709141274,
"grad_norm": 3.7377965450286865,
"learning_rate": 3.5532457890585126e-05,
"loss": 0.568,
"step": 2520
},
{
"epoch": 0.35041551246537395,
"grad_norm": 3.339667320251465,
"learning_rate": 3.551389726694818e-05,
"loss": 0.4745,
"step": 2530
},
{
"epoch": 0.3518005540166205,
"grad_norm": 4.469910621643066,
"learning_rate": 3.549533664331122e-05,
"loss": 0.5272,
"step": 2540
},
{
"epoch": 0.35318559556786705,
"grad_norm": 3.5588207244873047,
"learning_rate": 3.5476776019674265e-05,
"loss": 0.5189,
"step": 2550
},
{
"epoch": 0.3545706371191136,
"grad_norm": 4.583835124969482,
"learning_rate": 3.545821539603731e-05,
"loss": 0.6198,
"step": 2560
},
{
"epoch": 0.3559556786703601,
"grad_norm": 4.148881912231445,
"learning_rate": 3.543965477240035e-05,
"loss": 0.5521,
"step": 2570
},
{
"epoch": 0.3573407202216066,
"grad_norm": 11.703550338745117,
"learning_rate": 3.54210941487634e-05,
"loss": 0.4996,
"step": 2580
},
{
"epoch": 0.3587257617728532,
"grad_norm": 3.770169734954834,
"learning_rate": 3.5402533525126447e-05,
"loss": 0.5395,
"step": 2590
},
{
"epoch": 0.3601108033240997,
"grad_norm": 4.923569679260254,
"learning_rate": 3.53839729014895e-05,
"loss": 0.4904,
"step": 2600
},
{
"epoch": 0.36149584487534625,
"grad_norm": 2.8726134300231934,
"learning_rate": 3.536541227785254e-05,
"loss": 0.5093,
"step": 2610
},
{
"epoch": 0.3628808864265928,
"grad_norm": 2.879404306411743,
"learning_rate": 3.5346851654215585e-05,
"loss": 0.5385,
"step": 2620
},
{
"epoch": 0.36426592797783935,
"grad_norm": 2.7815260887145996,
"learning_rate": 3.532829103057863e-05,
"loss": 0.5266,
"step": 2630
},
{
"epoch": 0.3656509695290859,
"grad_norm": 3.5711984634399414,
"learning_rate": 3.530973040694168e-05,
"loss": 0.5257,
"step": 2640
},
{
"epoch": 0.3670360110803324,
"grad_norm": 2.760270118713379,
"learning_rate": 3.529116978330472e-05,
"loss": 0.5074,
"step": 2650
},
{
"epoch": 0.3684210526315789,
"grad_norm": 2.6999144554138184,
"learning_rate": 3.527260915966777e-05,
"loss": 0.5,
"step": 2660
},
{
"epoch": 0.3698060941828255,
"grad_norm": 3.2955033779144287,
"learning_rate": 3.525404853603081e-05,
"loss": 0.478,
"step": 2670
},
{
"epoch": 0.37119113573407203,
"grad_norm": 6.846369743347168,
"learning_rate": 3.5235487912393854e-05,
"loss": 0.5314,
"step": 2680
},
{
"epoch": 0.37257617728531855,
"grad_norm": 3.0586764812469482,
"learning_rate": 3.5216927288756905e-05,
"loss": 0.4788,
"step": 2690
},
{
"epoch": 0.3739612188365651,
"grad_norm": 5.029668807983398,
"learning_rate": 3.519836666511995e-05,
"loss": 0.4966,
"step": 2700
},
{
"epoch": 0.37534626038781166,
"grad_norm": 6.83918571472168,
"learning_rate": 3.5179806041483e-05,
"loss": 0.5342,
"step": 2710
},
{
"epoch": 0.3767313019390582,
"grad_norm": 3.3956825733184814,
"learning_rate": 3.516124541784604e-05,
"loss": 0.5528,
"step": 2720
},
{
"epoch": 0.3781163434903047,
"grad_norm": 3.8964269161224365,
"learning_rate": 3.514268479420909e-05,
"loss": 0.4569,
"step": 2730
},
{
"epoch": 0.37950138504155123,
"grad_norm": 3.9531123638153076,
"learning_rate": 3.512412417057213e-05,
"loss": 0.4998,
"step": 2740
},
{
"epoch": 0.3808864265927978,
"grad_norm": 4.969144344329834,
"learning_rate": 3.510556354693518e-05,
"loss": 0.4915,
"step": 2750
},
{
"epoch": 0.38227146814404434,
"grad_norm": 7.940041542053223,
"learning_rate": 3.5087002923298225e-05,
"loss": 0.5062,
"step": 2760
},
{
"epoch": 0.38365650969529086,
"grad_norm": 4.755401134490967,
"learning_rate": 3.506844229966127e-05,
"loss": 0.4752,
"step": 2770
},
{
"epoch": 0.3850415512465374,
"grad_norm": 6.793601989746094,
"learning_rate": 3.504988167602432e-05,
"loss": 0.449,
"step": 2780
},
{
"epoch": 0.3864265927977839,
"grad_norm": 3.899365186691284,
"learning_rate": 3.5031321052387363e-05,
"loss": 0.4956,
"step": 2790
},
{
"epoch": 0.3878116343490305,
"grad_norm": 4.510800838470459,
"learning_rate": 3.501276042875041e-05,
"loss": 0.5161,
"step": 2800
},
{
"epoch": 0.389196675900277,
"grad_norm": 3.8182857036590576,
"learning_rate": 3.499419980511345e-05,
"loss": 0.5209,
"step": 2810
},
{
"epoch": 0.39058171745152354,
"grad_norm": 4.2780256271362305,
"learning_rate": 3.49756391814765e-05,
"loss": 0.5402,
"step": 2820
},
{
"epoch": 0.39196675900277006,
"grad_norm": 3.448150157928467,
"learning_rate": 3.4957078557839545e-05,
"loss": 0.504,
"step": 2830
},
{
"epoch": 0.39335180055401664,
"grad_norm": 4.190673828125,
"learning_rate": 3.4938517934202596e-05,
"loss": 0.5799,
"step": 2840
},
{
"epoch": 0.39473684210526316,
"grad_norm": 4.70761775970459,
"learning_rate": 3.491995731056564e-05,
"loss": 0.4654,
"step": 2850
},
{
"epoch": 0.3961218836565097,
"grad_norm": 4.735783576965332,
"learning_rate": 3.4901396686928684e-05,
"loss": 0.4908,
"step": 2860
},
{
"epoch": 0.3975069252077562,
"grad_norm": 6.540755271911621,
"learning_rate": 3.488283606329173e-05,
"loss": 0.4903,
"step": 2870
},
{
"epoch": 0.3988919667590028,
"grad_norm": 4.559645175933838,
"learning_rate": 3.486427543965477e-05,
"loss": 0.5602,
"step": 2880
},
{
"epoch": 0.4002770083102493,
"grad_norm": 4.629283428192139,
"learning_rate": 3.484571481601782e-05,
"loss": 0.5174,
"step": 2890
},
{
"epoch": 0.40166204986149584,
"grad_norm": 3.9410197734832764,
"learning_rate": 3.4827154192380866e-05,
"loss": 0.5041,
"step": 2900
},
{
"epoch": 0.40304709141274236,
"grad_norm": 4.21798849105835,
"learning_rate": 3.481044963110761e-05,
"loss": 0.5106,
"step": 2910
},
{
"epoch": 0.40443213296398894,
"grad_norm": 4.3796257972717285,
"learning_rate": 3.479188900747065e-05,
"loss": 0.4752,
"step": 2920
},
{
"epoch": 0.40581717451523547,
"grad_norm": 3.2095866203308105,
"learning_rate": 3.4773328383833696e-05,
"loss": 0.4748,
"step": 2930
},
{
"epoch": 0.407202216066482,
"grad_norm": 6.470020771026611,
"learning_rate": 3.475476776019675e-05,
"loss": 0.5938,
"step": 2940
},
{
"epoch": 0.4085872576177285,
"grad_norm": 4.654620170593262,
"learning_rate": 3.473620713655979e-05,
"loss": 0.5489,
"step": 2950
},
{
"epoch": 0.4099722991689751,
"grad_norm": 3.7245383262634277,
"learning_rate": 3.471764651292284e-05,
"loss": 0.4662,
"step": 2960
},
{
"epoch": 0.4113573407202216,
"grad_norm": 4.06242036819458,
"learning_rate": 3.4699085889285885e-05,
"loss": 0.4416,
"step": 2970
},
{
"epoch": 0.41274238227146814,
"grad_norm": 3.3347277641296387,
"learning_rate": 3.468052526564893e-05,
"loss": 0.5031,
"step": 2980
},
{
"epoch": 0.41412742382271467,
"grad_norm": 4.123641490936279,
"learning_rate": 3.466196464201197e-05,
"loss": 0.5363,
"step": 2990
},
{
"epoch": 0.4155124653739612,
"grad_norm": 4.5999016761779785,
"learning_rate": 3.464340401837502e-05,
"loss": 0.4584,
"step": 3000
},
{
"epoch": 0.41565096952908587,
"eval_loss": 0.48571890592575073,
"eval_runtime": 1418.1173,
"eval_samples_per_second": 6.431,
"eval_steps_per_second": 0.804,
"step": 3001
},
{
"epoch": 0.4168975069252078,
"grad_norm": 5.3403496742248535,
"learning_rate": 3.462484339473807e-05,
"loss": 0.5159,
"step": 3010
},
{
"epoch": 0.4182825484764543,
"grad_norm": 4.382096290588379,
"learning_rate": 3.46081388334648e-05,
"loss": 0.5077,
"step": 3020
},
{
"epoch": 0.4196675900277008,
"grad_norm": 3.537562131881714,
"learning_rate": 3.4589578209827854e-05,
"loss": 0.4805,
"step": 3030
},
{
"epoch": 0.42105263157894735,
"grad_norm": 2.426811456680298,
"learning_rate": 3.45710175861909e-05,
"loss": 0.4675,
"step": 3040
},
{
"epoch": 0.4224376731301939,
"grad_norm": 4.311849117279053,
"learning_rate": 3.455245696255395e-05,
"loss": 0.5579,
"step": 3050
},
{
"epoch": 0.42382271468144045,
"grad_norm": 3.4808189868927,
"learning_rate": 3.453389633891699e-05,
"loss": 0.4591,
"step": 3060
},
{
"epoch": 0.425207756232687,
"grad_norm": 4.2031049728393555,
"learning_rate": 3.4515335715280036e-05,
"loss": 0.5279,
"step": 3070
},
{
"epoch": 0.4265927977839335,
"grad_norm": 4.175496578216553,
"learning_rate": 3.449677509164308e-05,
"loss": 0.4731,
"step": 3080
},
{
"epoch": 0.4279778393351801,
"grad_norm": 3.436108350753784,
"learning_rate": 3.447821446800612e-05,
"loss": 0.4754,
"step": 3090
},
{
"epoch": 0.4293628808864266,
"grad_norm": 6.296165943145752,
"learning_rate": 3.4459653844369174e-05,
"loss": 0.4821,
"step": 3100
},
{
"epoch": 0.4307479224376731,
"grad_norm": 3.948596239089966,
"learning_rate": 3.444109322073222e-05,
"loss": 0.5217,
"step": 3110
},
{
"epoch": 0.43213296398891965,
"grad_norm": 3.046654224395752,
"learning_rate": 3.442253259709527e-05,
"loss": 0.4889,
"step": 3120
},
{
"epoch": 0.43351800554016623,
"grad_norm": 2.8600590229034424,
"learning_rate": 3.440397197345831e-05,
"loss": 0.4593,
"step": 3130
},
{
"epoch": 0.43490304709141275,
"grad_norm": 3.5031280517578125,
"learning_rate": 3.4385411349821356e-05,
"loss": 0.4989,
"step": 3140
},
{
"epoch": 0.4362880886426593,
"grad_norm": 4.562175750732422,
"learning_rate": 3.43668507261844e-05,
"loss": 0.5269,
"step": 3150
},
{
"epoch": 0.4376731301939058,
"grad_norm": 3.6332297325134277,
"learning_rate": 3.434829010254745e-05,
"loss": 0.4136,
"step": 3160
},
{
"epoch": 0.4390581717451524,
"grad_norm": 3.7745444774627686,
"learning_rate": 3.4329729478910494e-05,
"loss": 0.4953,
"step": 3170
},
{
"epoch": 0.4404432132963989,
"grad_norm": 3.368767499923706,
"learning_rate": 3.431116885527354e-05,
"loss": 0.4436,
"step": 3180
},
{
"epoch": 0.44182825484764543,
"grad_norm": 3.215421676635742,
"learning_rate": 3.429260823163659e-05,
"loss": 0.5105,
"step": 3190
},
{
"epoch": 0.44321329639889195,
"grad_norm": 4.738856315612793,
"learning_rate": 3.427404760799963e-05,
"loss": 0.5586,
"step": 3200
},
{
"epoch": 0.4445983379501385,
"grad_norm": 10.23610782623291,
"learning_rate": 3.4255486984362676e-05,
"loss": 0.5084,
"step": 3210
},
{
"epoch": 0.44598337950138506,
"grad_norm": 6.943126678466797,
"learning_rate": 3.423692636072572e-05,
"loss": 0.4687,
"step": 3220
},
{
"epoch": 0.4473684210526316,
"grad_norm": 3.2041103839874268,
"learning_rate": 3.421836573708877e-05,
"loss": 0.4768,
"step": 3230
},
{
"epoch": 0.4487534626038781,
"grad_norm": 7.946971893310547,
"learning_rate": 3.4199805113451814e-05,
"loss": 0.5148,
"step": 3240
},
{
"epoch": 0.45013850415512463,
"grad_norm": 5.417855739593506,
"learning_rate": 3.4181244489814865e-05,
"loss": 0.5197,
"step": 3250
},
{
"epoch": 0.4515235457063712,
"grad_norm": 3.584216833114624,
"learning_rate": 3.416268386617791e-05,
"loss": 0.4556,
"step": 3260
},
{
"epoch": 0.45290858725761773,
"grad_norm": 3.537973165512085,
"learning_rate": 3.414412324254095e-05,
"loss": 0.4912,
"step": 3270
},
{
"epoch": 0.45429362880886426,
"grad_norm": 3.106688976287842,
"learning_rate": 3.4125562618903996e-05,
"loss": 0.4669,
"step": 3280
},
{
"epoch": 0.4556786703601108,
"grad_norm": 4.34760046005249,
"learning_rate": 3.410700199526704e-05,
"loss": 0.4704,
"step": 3290
},
{
"epoch": 0.45706371191135736,
"grad_norm": 3.2222983837127686,
"learning_rate": 3.408844137163009e-05,
"loss": 0.4986,
"step": 3300
},
{
"epoch": 0.4584487534626039,
"grad_norm": 3.6748571395874023,
"learning_rate": 3.4069880747993135e-05,
"loss": 0.4944,
"step": 3310
},
{
"epoch": 0.4598337950138504,
"grad_norm": 2.6829590797424316,
"learning_rate": 3.4051320124356185e-05,
"loss": 0.4449,
"step": 3320
},
{
"epoch": 0.46121883656509693,
"grad_norm": 2.984292984008789,
"learning_rate": 3.403275950071923e-05,
"loss": 0.4484,
"step": 3330
},
{
"epoch": 0.4626038781163435,
"grad_norm": 4.855510234832764,
"learning_rate": 3.401419887708227e-05,
"loss": 0.5278,
"step": 3340
},
{
"epoch": 0.46398891966759004,
"grad_norm": 3.2982306480407715,
"learning_rate": 3.3995638253445317e-05,
"loss": 0.5044,
"step": 3350
},
{
"epoch": 0.46537396121883656,
"grad_norm": 6.512004852294922,
"learning_rate": 3.397707762980837e-05,
"loss": 0.471,
"step": 3360
},
{
"epoch": 0.4667590027700831,
"grad_norm": 4.388622760772705,
"learning_rate": 3.395851700617141e-05,
"loss": 0.4689,
"step": 3370
},
{
"epoch": 0.46814404432132967,
"grad_norm": 3.218029022216797,
"learning_rate": 3.3939956382534455e-05,
"loss": 0.4975,
"step": 3380
},
{
"epoch": 0.4695290858725762,
"grad_norm": 3.2648587226867676,
"learning_rate": 3.39213957588975e-05,
"loss": 0.4542,
"step": 3390
},
{
"epoch": 0.4709141274238227,
"grad_norm": 4.079712390899658,
"learning_rate": 3.390283513526054e-05,
"loss": 0.4281,
"step": 3400
},
{
"epoch": 0.47229916897506924,
"grad_norm": 4.063242435455322,
"learning_rate": 3.388427451162359e-05,
"loss": 0.5375,
"step": 3410
},
{
"epoch": 0.47368421052631576,
"grad_norm": 3.1259000301361084,
"learning_rate": 3.386571388798664e-05,
"loss": 0.4897,
"step": 3420
},
{
"epoch": 0.47506925207756234,
"grad_norm": 6.046876430511475,
"learning_rate": 3.384715326434969e-05,
"loss": 0.4871,
"step": 3430
},
{
"epoch": 0.47645429362880887,
"grad_norm": 23.784345626831055,
"learning_rate": 3.382859264071273e-05,
"loss": 0.4712,
"step": 3440
},
{
"epoch": 0.4778393351800554,
"grad_norm": 4.167202472686768,
"learning_rate": 3.3810032017075775e-05,
"loss": 0.4637,
"step": 3450
},
{
"epoch": 0.4792243767313019,
"grad_norm": 7.40032434463501,
"learning_rate": 3.379147139343882e-05,
"loss": 0.4799,
"step": 3460
},
{
"epoch": 0.4806094182825485,
"grad_norm": 3.7741355895996094,
"learning_rate": 3.377291076980187e-05,
"loss": 0.513,
"step": 3470
},
{
"epoch": 0.481994459833795,
"grad_norm": 2.4384195804595947,
"learning_rate": 3.375435014616491e-05,
"loss": 0.501,
"step": 3480
},
{
"epoch": 0.48337950138504154,
"grad_norm": 5.508775234222412,
"learning_rate": 3.3735789522527964e-05,
"loss": 0.4425,
"step": 3490
},
{
"epoch": 0.48476454293628807,
"grad_norm": 3.220346212387085,
"learning_rate": 3.371722889889101e-05,
"loss": 0.4754,
"step": 3500
},
{
"epoch": 0.48614958448753465,
"grad_norm": 3.7024574279785156,
"learning_rate": 3.369866827525405e-05,
"loss": 0.4953,
"step": 3510
},
{
"epoch": 0.48753462603878117,
"grad_norm": 5.731636047363281,
"learning_rate": 3.3680107651617095e-05,
"loss": 0.4893,
"step": 3520
},
{
"epoch": 0.4889196675900277,
"grad_norm": 2.938441038131714,
"learning_rate": 3.366154702798014e-05,
"loss": 0.4364,
"step": 3530
},
{
"epoch": 0.4903047091412742,
"grad_norm": 3.5388107299804688,
"learning_rate": 3.364298640434319e-05,
"loss": 0.4975,
"step": 3540
},
{
"epoch": 0.4916897506925208,
"grad_norm": 4.601080894470215,
"learning_rate": 3.3624425780706233e-05,
"loss": 0.5368,
"step": 3550
},
{
"epoch": 0.4930747922437673,
"grad_norm": 6.017910480499268,
"learning_rate": 3.3605865157069284e-05,
"loss": 0.4677,
"step": 3560
},
{
"epoch": 0.49445983379501385,
"grad_norm": 4.040421962738037,
"learning_rate": 3.358730453343233e-05,
"loss": 0.4467,
"step": 3570
},
{
"epoch": 0.49584487534626037,
"grad_norm": 4.375626087188721,
"learning_rate": 3.356874390979537e-05,
"loss": 0.4409,
"step": 3580
},
{
"epoch": 0.49722991689750695,
"grad_norm": 3.4123740196228027,
"learning_rate": 3.3550183286158415e-05,
"loss": 0.4369,
"step": 3590
},
{
"epoch": 0.4986149584487535,
"grad_norm": 3.231276750564575,
"learning_rate": 3.353162266252146e-05,
"loss": 0.4736,
"step": 3600
},
{
"epoch": 0.5,
"grad_norm": 6.275581359863281,
"learning_rate": 3.351306203888451e-05,
"loss": 0.464,
"step": 3610
},
{
"epoch": 0.5013850415512465,
"grad_norm": 4.812581539154053,
"learning_rate": 3.3494501415247554e-05,
"loss": 0.4516,
"step": 3620
},
{
"epoch": 0.502770083102493,
"grad_norm": 9.834177017211914,
"learning_rate": 3.3475940791610604e-05,
"loss": 0.4231,
"step": 3630
},
{
"epoch": 0.5041551246537396,
"grad_norm": 3.000638246536255,
"learning_rate": 3.345738016797365e-05,
"loss": 0.3877,
"step": 3640
},
{
"epoch": 0.5055401662049861,
"grad_norm": 3.8207039833068848,
"learning_rate": 3.343881954433669e-05,
"loss": 0.4568,
"step": 3650
},
{
"epoch": 0.5069252077562327,
"grad_norm": 4.181671142578125,
"learning_rate": 3.3420258920699736e-05,
"loss": 0.458,
"step": 3660
},
{
"epoch": 0.5083102493074793,
"grad_norm": 3.0442757606506348,
"learning_rate": 3.3401698297062786e-05,
"loss": 0.4284,
"step": 3670
},
{
"epoch": 0.5096952908587258,
"grad_norm": 4.325800895690918,
"learning_rate": 3.338313767342583e-05,
"loss": 0.4374,
"step": 3680
},
{
"epoch": 0.5110803324099723,
"grad_norm": 4.549467086791992,
"learning_rate": 3.336457704978888e-05,
"loss": 0.4454,
"step": 3690
},
{
"epoch": 0.5124653739612188,
"grad_norm": 3.2602922916412354,
"learning_rate": 3.3346016426151924e-05,
"loss": 0.4707,
"step": 3700
},
{
"epoch": 0.5138504155124654,
"grad_norm": 2.9979166984558105,
"learning_rate": 3.332745580251497e-05,
"loss": 0.4444,
"step": 3710
},
{
"epoch": 0.5152354570637119,
"grad_norm": 3.0722646713256836,
"learning_rate": 3.330889517887801e-05,
"loss": 0.4034,
"step": 3720
},
{
"epoch": 0.5166204986149584,
"grad_norm": 3.6399686336517334,
"learning_rate": 3.3290334555241056e-05,
"loss": 0.4298,
"step": 3730
},
{
"epoch": 0.518005540166205,
"grad_norm": 3.675416946411133,
"learning_rate": 3.3271773931604107e-05,
"loss": 0.4135,
"step": 3740
},
{
"epoch": 0.5193905817174516,
"grad_norm": 4.051141738891602,
"learning_rate": 3.325321330796715e-05,
"loss": 0.4872,
"step": 3750
},
{
"epoch": 0.5207756232686981,
"grad_norm": 3.5294289588928223,
"learning_rate": 3.32346526843302e-05,
"loss": 0.4141,
"step": 3760
},
{
"epoch": 0.5221606648199446,
"grad_norm": 4.430352687835693,
"learning_rate": 3.321609206069324e-05,
"loss": 0.4086,
"step": 3770
},
{
"epoch": 0.5235457063711911,
"grad_norm": 3.236701011657715,
"learning_rate": 3.319753143705629e-05,
"loss": 0.4745,
"step": 3780
},
{
"epoch": 0.5249307479224377,
"grad_norm": 3.0971288681030273,
"learning_rate": 3.317897081341933e-05,
"loss": 0.4963,
"step": 3790
},
{
"epoch": 0.5263157894736842,
"grad_norm": 4.079372406005859,
"learning_rate": 3.316041018978238e-05,
"loss": 0.4894,
"step": 3800
},
{
"epoch": 0.5277008310249307,
"grad_norm": 4.282620906829834,
"learning_rate": 3.314184956614543e-05,
"loss": 0.4807,
"step": 3810
},
{
"epoch": 0.5290858725761773,
"grad_norm": 2.9405722618103027,
"learning_rate": 3.312328894250847e-05,
"loss": 0.4863,
"step": 3820
},
{
"epoch": 0.5304709141274239,
"grad_norm": 2.777052402496338,
"learning_rate": 3.3104728318871514e-05,
"loss": 0.4326,
"step": 3830
},
{
"epoch": 0.5318559556786704,
"grad_norm": 2.2695729732513428,
"learning_rate": 3.308616769523456e-05,
"loss": 0.4694,
"step": 3840
},
{
"epoch": 0.5332409972299169,
"grad_norm": 3.705563545227051,
"learning_rate": 3.306760707159761e-05,
"loss": 0.4308,
"step": 3850
},
{
"epoch": 0.5346260387811634,
"grad_norm": 3.7749478816986084,
"learning_rate": 3.304904644796065e-05,
"loss": 0.4174,
"step": 3860
},
{
"epoch": 0.53601108033241,
"grad_norm": 4.059981822967529,
"learning_rate": 3.30304858243237e-05,
"loss": 0.4492,
"step": 3870
},
{
"epoch": 0.5373961218836565,
"grad_norm": 3.0971269607543945,
"learning_rate": 3.301192520068675e-05,
"loss": 0.4399,
"step": 3880
},
{
"epoch": 0.538781163434903,
"grad_norm": 4.655627250671387,
"learning_rate": 3.299336457704979e-05,
"loss": 0.4115,
"step": 3890
},
{
"epoch": 0.5401662049861495,
"grad_norm": 4.1396684646606445,
"learning_rate": 3.2974803953412835e-05,
"loss": 0.4738,
"step": 3900
},
{
"epoch": 0.5415512465373962,
"grad_norm": 3.7863826751708984,
"learning_rate": 3.295624332977588e-05,
"loss": 0.474,
"step": 3910
},
{
"epoch": 0.5429362880886427,
"grad_norm": 2.854278087615967,
"learning_rate": 3.293768270613893e-05,
"loss": 0.4478,
"step": 3920
},
{
"epoch": 0.5443213296398892,
"grad_norm": 5.139938831329346,
"learning_rate": 3.291912208250197e-05,
"loss": 0.484,
"step": 3930
},
{
"epoch": 0.5457063711911357,
"grad_norm": 3.8379576206207275,
"learning_rate": 3.290056145886502e-05,
"loss": 0.4665,
"step": 3940
},
{
"epoch": 0.5470914127423823,
"grad_norm": 3.043219566345215,
"learning_rate": 3.288200083522807e-05,
"loss": 0.484,
"step": 3950
},
{
"epoch": 0.5484764542936288,
"grad_norm": 3.708510398864746,
"learning_rate": 3.286344021159111e-05,
"loss": 0.4471,
"step": 3960
},
{
"epoch": 0.5498614958448753,
"grad_norm": 5.9408159255981445,
"learning_rate": 3.2844879587954155e-05,
"loss": 0.4637,
"step": 3970
},
{
"epoch": 0.5512465373961218,
"grad_norm": 2.6901965141296387,
"learning_rate": 3.2826318964317205e-05,
"loss": 0.4472,
"step": 3980
},
{
"epoch": 0.5526315789473685,
"grad_norm": 3.6067564487457275,
"learning_rate": 3.280775834068025e-05,
"loss": 0.4264,
"step": 3990
},
{
"epoch": 0.554016620498615,
"grad_norm": 4.2996344566345215,
"learning_rate": 3.27891977170433e-05,
"loss": 0.4591,
"step": 4000
},
{
"epoch": 0.5554016620498615,
"grad_norm": 3.5069291591644287,
"learning_rate": 3.2770637093406344e-05,
"loss": 0.4149,
"step": 4010
},
{
"epoch": 0.556786703601108,
"grad_norm": 3.3348257541656494,
"learning_rate": 3.275207646976939e-05,
"loss": 0.4734,
"step": 4020
},
{
"epoch": 0.5581717451523546,
"grad_norm": 3.276700258255005,
"learning_rate": 3.273351584613243e-05,
"loss": 0.4616,
"step": 4030
},
{
"epoch": 0.5595567867036011,
"grad_norm": 2.2809834480285645,
"learning_rate": 3.2714955222495475e-05,
"loss": 0.4587,
"step": 4040
},
{
"epoch": 0.5609418282548476,
"grad_norm": 4.095324993133545,
"learning_rate": 3.2696394598858526e-05,
"loss": 0.441,
"step": 4050
},
{
"epoch": 0.5623268698060941,
"grad_norm": 3.506425619125366,
"learning_rate": 3.267783397522157e-05,
"loss": 0.4117,
"step": 4060
},
{
"epoch": 0.5637119113573407,
"grad_norm": 2.7167859077453613,
"learning_rate": 3.265927335158462e-05,
"loss": 0.4472,
"step": 4070
},
{
"epoch": 0.5650969529085873,
"grad_norm": 2.6138830184936523,
"learning_rate": 3.2640712727947664e-05,
"loss": 0.4344,
"step": 4080
},
{
"epoch": 0.5664819944598338,
"grad_norm": 4.395142078399658,
"learning_rate": 3.262215210431071e-05,
"loss": 0.3822,
"step": 4090
},
{
"epoch": 0.5678670360110804,
"grad_norm": 4.22299861907959,
"learning_rate": 3.260359148067375e-05,
"loss": 0.439,
"step": 4100
},
{
"epoch": 0.5692520775623269,
"grad_norm": 2.483031988143921,
"learning_rate": 3.25850308570368e-05,
"loss": 0.4168,
"step": 4110
},
{
"epoch": 0.5706371191135734,
"grad_norm": 5.224299430847168,
"learning_rate": 3.2566470233399846e-05,
"loss": 0.4622,
"step": 4120
},
{
"epoch": 0.5720221606648199,
"grad_norm": 4.899472713470459,
"learning_rate": 3.254790960976289e-05,
"loss": 0.4427,
"step": 4130
},
{
"epoch": 0.5734072022160664,
"grad_norm": 4.368985652923584,
"learning_rate": 3.252934898612594e-05,
"loss": 0.4067,
"step": 4140
},
{
"epoch": 0.574792243767313,
"grad_norm": 2.419369697570801,
"learning_rate": 3.2510788362488984e-05,
"loss": 0.4096,
"step": 4150
},
{
"epoch": 0.5761772853185596,
"grad_norm": 3.5503952503204346,
"learning_rate": 3.249222773885203e-05,
"loss": 0.4385,
"step": 4160
},
{
"epoch": 0.5775623268698061,
"grad_norm": 4.396968841552734,
"learning_rate": 3.247366711521507e-05,
"loss": 0.4686,
"step": 4170
},
{
"epoch": 0.5789473684210527,
"grad_norm": 2.777759313583374,
"learning_rate": 3.245510649157812e-05,
"loss": 0.4215,
"step": 4180
},
{
"epoch": 0.5803324099722992,
"grad_norm": 5.332396030426025,
"learning_rate": 3.2436545867941166e-05,
"loss": 0.4416,
"step": 4190
},
{
"epoch": 0.5817174515235457,
"grad_norm": 2.5784120559692383,
"learning_rate": 3.241798524430421e-05,
"loss": 0.425,
"step": 4200
},
{
"epoch": 0.5831024930747922,
"grad_norm": 2.983954668045044,
"learning_rate": 3.2399424620667254e-05,
"loss": 0.4213,
"step": 4210
},
{
"epoch": 0.5844875346260388,
"grad_norm": 3.8476195335388184,
"learning_rate": 3.23808639970303e-05,
"loss": 0.4716,
"step": 4220
},
{
"epoch": 0.5858725761772853,
"grad_norm": 12.473455429077148,
"learning_rate": 3.236230337339335e-05,
"loss": 0.428,
"step": 4230
},
{
"epoch": 0.5872576177285319,
"grad_norm": 2.7237050533294678,
"learning_rate": 3.234374274975639e-05,
"loss": 0.4505,
"step": 4240
},
{
"epoch": 0.5886426592797784,
"grad_norm": 3.166870594024658,
"learning_rate": 3.232518212611944e-05,
"loss": 0.4082,
"step": 4250
},
{
"epoch": 0.590027700831025,
"grad_norm": 2.7044925689697266,
"learning_rate": 3.2306621502482486e-05,
"loss": 0.451,
"step": 4260
},
{
"epoch": 0.5914127423822715,
"grad_norm": 2.526685953140259,
"learning_rate": 3.228806087884553e-05,
"loss": 0.3867,
"step": 4270
},
{
"epoch": 0.592797783933518,
"grad_norm": 3.4184718132019043,
"learning_rate": 3.2269500255208574e-05,
"loss": 0.4196,
"step": 4280
},
{
"epoch": 0.5941828254847645,
"grad_norm": 2.575986862182617,
"learning_rate": 3.2250939631571625e-05,
"loss": 0.4277,
"step": 4290
},
{
"epoch": 0.5955678670360111,
"grad_norm": 8.845891952514648,
"learning_rate": 3.223237900793467e-05,
"loss": 0.4054,
"step": 4300
},
{
"epoch": 0.5969529085872576,
"grad_norm": 2.8799660205841064,
"learning_rate": 3.221381838429772e-05,
"loss": 0.4524,
"step": 4310
},
{
"epoch": 0.5983379501385041,
"grad_norm": 2.95517897605896,
"learning_rate": 3.219525776066076e-05,
"loss": 0.4075,
"step": 4320
},
{
"epoch": 0.5997229916897507,
"grad_norm": 3.06547212600708,
"learning_rate": 3.2176697137023807e-05,
"loss": 0.4467,
"step": 4330
},
{
"epoch": 0.6011080332409973,
"grad_norm": 4.401691913604736,
"learning_rate": 3.215813651338685e-05,
"loss": 0.4717,
"step": 4340
},
{
"epoch": 0.6024930747922438,
"grad_norm": 3.3205084800720215,
"learning_rate": 3.2139575889749894e-05,
"loss": 0.4815,
"step": 4350
},
{
"epoch": 0.6038781163434903,
"grad_norm": 4.638674736022949,
"learning_rate": 3.2121015266112945e-05,
"loss": 0.4042,
"step": 4360
},
{
"epoch": 0.6052631578947368,
"grad_norm": 2.9574596881866455,
"learning_rate": 3.210245464247599e-05,
"loss": 0.4469,
"step": 4370
},
{
"epoch": 0.6066481994459834,
"grad_norm": 4.905550956726074,
"learning_rate": 3.208389401883904e-05,
"loss": 0.4386,
"step": 4380
},
{
"epoch": 0.6080332409972299,
"grad_norm": 4.671908855438232,
"learning_rate": 3.206533339520208e-05,
"loss": 0.4219,
"step": 4390
},
{
"epoch": 0.6094182825484764,
"grad_norm": 3.3245506286621094,
"learning_rate": 3.204677277156513e-05,
"loss": 0.4523,
"step": 4400
},
{
"epoch": 0.610803324099723,
"grad_norm": 4.6899285316467285,
"learning_rate": 3.202821214792817e-05,
"loss": 0.4662,
"step": 4410
},
{
"epoch": 0.6121883656509696,
"grad_norm": 3.6640164852142334,
"learning_rate": 3.200965152429122e-05,
"loss": 0.3351,
"step": 4420
},
{
"epoch": 0.6135734072022161,
"grad_norm": 3.060845375061035,
"learning_rate": 3.1991090900654265e-05,
"loss": 0.3734,
"step": 4430
},
{
"epoch": 0.6149584487534626,
"grad_norm": 3.4755430221557617,
"learning_rate": 3.197253027701731e-05,
"loss": 0.4375,
"step": 4440
},
{
"epoch": 0.6163434903047091,
"grad_norm": 3.331393241882324,
"learning_rate": 3.195396965338036e-05,
"loss": 0.4491,
"step": 4450
},
{
"epoch": 0.6177285318559557,
"grad_norm": 3.3888213634490967,
"learning_rate": 3.19354090297434e-05,
"loss": 0.3774,
"step": 4460
},
{
"epoch": 0.6191135734072022,
"grad_norm": 3.0479416847229004,
"learning_rate": 3.191684840610645e-05,
"loss": 0.3667,
"step": 4470
},
{
"epoch": 0.6204986149584487,
"grad_norm": 3.2111737728118896,
"learning_rate": 3.189828778246949e-05,
"loss": 0.4133,
"step": 4480
},
{
"epoch": 0.6218836565096952,
"grad_norm": 3.8854033946990967,
"learning_rate": 3.187972715883254e-05,
"loss": 0.4047,
"step": 4490
},
{
"epoch": 0.6232686980609419,
"grad_norm": 2.769989252090454,
"learning_rate": 3.1861166535195585e-05,
"loss": 0.4135,
"step": 4500
},
{
"epoch": 0.6246537396121884,
"grad_norm": 3.127634048461914,
"learning_rate": 3.1842605911558636e-05,
"loss": 0.4154,
"step": 4510
},
{
"epoch": 0.6260387811634349,
"grad_norm": 3.8508613109588623,
"learning_rate": 3.182404528792168e-05,
"loss": 0.4187,
"step": 4520
},
{
"epoch": 0.6274238227146814,
"grad_norm": 3.4920480251312256,
"learning_rate": 3.1805484664284723e-05,
"loss": 0.4257,
"step": 4530
},
{
"epoch": 0.628808864265928,
"grad_norm": 1.9404661655426025,
"learning_rate": 3.178692404064777e-05,
"loss": 0.4067,
"step": 4540
},
{
"epoch": 0.6301939058171745,
"grad_norm": 2.8970749378204346,
"learning_rate": 3.176836341701081e-05,
"loss": 0.4321,
"step": 4550
},
{
"epoch": 0.631578947368421,
"grad_norm": 2.96341609954834,
"learning_rate": 3.174980279337386e-05,
"loss": 0.4049,
"step": 4560
},
{
"epoch": 0.6329639889196675,
"grad_norm": 2.9328839778900146,
"learning_rate": 3.1731242169736905e-05,
"loss": 0.3811,
"step": 4570
},
{
"epoch": 0.6343490304709142,
"grad_norm": 3.4064273834228516,
"learning_rate": 3.171268154609995e-05,
"loss": 0.4501,
"step": 4580
},
{
"epoch": 0.6357340720221607,
"grad_norm": 4.2957682609558105,
"learning_rate": 3.169597698482669e-05,
"loss": 0.4221,
"step": 4590
},
{
"epoch": 0.6371191135734072,
"grad_norm": 2.247894763946533,
"learning_rate": 3.1677416361189736e-05,
"loss": 0.391,
"step": 4600
},
{
"epoch": 0.6385041551246537,
"grad_norm": 3.282683849334717,
"learning_rate": 3.1658855737552786e-05,
"loss": 0.4178,
"step": 4610
},
{
"epoch": 0.6398891966759003,
"grad_norm": 7.642999172210693,
"learning_rate": 3.164029511391583e-05,
"loss": 0.4455,
"step": 4620
},
{
"epoch": 0.6412742382271468,
"grad_norm": 2.312368392944336,
"learning_rate": 3.1621734490278874e-05,
"loss": 0.4496,
"step": 4630
},
{
"epoch": 0.6426592797783933,
"grad_norm": 3.5621540546417236,
"learning_rate": 3.160317386664192e-05,
"loss": 0.4543,
"step": 4640
},
{
"epoch": 0.6440443213296398,
"grad_norm": 3.406416416168213,
"learning_rate": 3.158461324300497e-05,
"loss": 0.455,
"step": 4650
},
{
"epoch": 0.6454293628808865,
"grad_norm": 3.192976236343384,
"learning_rate": 3.156605261936801e-05,
"loss": 0.4025,
"step": 4660
},
{
"epoch": 0.646814404432133,
"grad_norm": 5.069347858428955,
"learning_rate": 3.154749199573106e-05,
"loss": 0.3751,
"step": 4670
},
{
"epoch": 0.6481994459833795,
"grad_norm": 4.459850311279297,
"learning_rate": 3.152893137209411e-05,
"loss": 0.4012,
"step": 4680
},
{
"epoch": 0.649584487534626,
"grad_norm": 2.7644102573394775,
"learning_rate": 3.151037074845715e-05,
"loss": 0.3464,
"step": 4690
},
{
"epoch": 0.6509695290858726,
"grad_norm": 3.0112130641937256,
"learning_rate": 3.1491810124820194e-05,
"loss": 0.4453,
"step": 4700
},
{
"epoch": 0.6523545706371191,
"grad_norm": 4.661027908325195,
"learning_rate": 3.147324950118324e-05,
"loss": 0.4233,
"step": 4710
},
{
"epoch": 0.6537396121883656,
"grad_norm": 5.308691501617432,
"learning_rate": 3.145468887754629e-05,
"loss": 0.4013,
"step": 4720
},
{
"epoch": 0.6551246537396122,
"grad_norm": 3.739926815032959,
"learning_rate": 3.143612825390933e-05,
"loss": 0.3827,
"step": 4730
},
{
"epoch": 0.6565096952908587,
"grad_norm": 3.5572757720947266,
"learning_rate": 3.141756763027238e-05,
"loss": 0.3911,
"step": 4740
},
{
"epoch": 0.6578947368421053,
"grad_norm": 3.1638436317443848,
"learning_rate": 3.139900700663543e-05,
"loss": 0.3802,
"step": 4750
},
{
"epoch": 0.6592797783933518,
"grad_norm": 3.4495441913604736,
"learning_rate": 3.138044638299847e-05,
"loss": 0.4217,
"step": 4760
},
{
"epoch": 0.6606648199445984,
"grad_norm": 3.4912328720092773,
"learning_rate": 3.1361885759361515e-05,
"loss": 0.3968,
"step": 4770
},
{
"epoch": 0.6620498614958449,
"grad_norm": 4.580535888671875,
"learning_rate": 3.1343325135724565e-05,
"loss": 0.4025,
"step": 4780
},
{
"epoch": 0.6634349030470914,
"grad_norm": 2.982192039489746,
"learning_rate": 3.132476451208761e-05,
"loss": 0.4258,
"step": 4790
},
{
"epoch": 0.6648199445983379,
"grad_norm": 3.1720199584960938,
"learning_rate": 3.130620388845065e-05,
"loss": 0.4045,
"step": 4800
},
{
"epoch": 0.6662049861495845,
"grad_norm": 3.1427557468414307,
"learning_rate": 3.12876432648137e-05,
"loss": 0.4073,
"step": 4810
},
{
"epoch": 0.667590027700831,
"grad_norm": 3.666034698486328,
"learning_rate": 3.126908264117675e-05,
"loss": 0.4056,
"step": 4820
},
{
"epoch": 0.6689750692520776,
"grad_norm": 4.027162075042725,
"learning_rate": 3.125052201753979e-05,
"loss": 0.3917,
"step": 4830
},
{
"epoch": 0.6703601108033241,
"grad_norm": 4.736123561859131,
"learning_rate": 3.1231961393902835e-05,
"loss": 0.4332,
"step": 4840
},
{
"epoch": 0.6717451523545707,
"grad_norm": 3.1902873516082764,
"learning_rate": 3.1213400770265885e-05,
"loss": 0.4597,
"step": 4850
},
{
"epoch": 0.6731301939058172,
"grad_norm": 3.289825916290283,
"learning_rate": 3.119484014662893e-05,
"loss": 0.4102,
"step": 4860
},
{
"epoch": 0.6745152354570637,
"grad_norm": 3.381176471710205,
"learning_rate": 3.117627952299198e-05,
"loss": 0.3899,
"step": 4870
},
{
"epoch": 0.6759002770083102,
"grad_norm": 3.6667535305023193,
"learning_rate": 3.1157718899355024e-05,
"loss": 0.3948,
"step": 4880
},
{
"epoch": 0.6772853185595568,
"grad_norm": 4.411403656005859,
"learning_rate": 3.113915827571807e-05,
"loss": 0.396,
"step": 4890
},
{
"epoch": 0.6786703601108033,
"grad_norm": 3.952453136444092,
"learning_rate": 3.112059765208111e-05,
"loss": 0.3817,
"step": 4900
},
{
"epoch": 0.6800554016620498,
"grad_norm": 4.688000679016113,
"learning_rate": 3.1102037028444155e-05,
"loss": 0.4643,
"step": 4910
},
{
"epoch": 0.6814404432132964,
"grad_norm": 3.189419984817505,
"learning_rate": 3.1083476404807206e-05,
"loss": 0.4133,
"step": 4920
},
{
"epoch": 0.682825484764543,
"grad_norm": 2.98185658454895,
"learning_rate": 3.106491578117025e-05,
"loss": 0.3862,
"step": 4930
},
{
"epoch": 0.6842105263157895,
"grad_norm": 3.0567667484283447,
"learning_rate": 3.10463551575333e-05,
"loss": 0.4629,
"step": 4940
},
{
"epoch": 0.685595567867036,
"grad_norm": 2.5653200149536133,
"learning_rate": 3.102779453389634e-05,
"loss": 0.4463,
"step": 4950
},
{
"epoch": 0.6869806094182825,
"grad_norm": 4.169416427612305,
"learning_rate": 3.100923391025939e-05,
"loss": 0.4068,
"step": 4960
},
{
"epoch": 0.6883656509695291,
"grad_norm": 2.639570951461792,
"learning_rate": 3.099067328662243e-05,
"loss": 0.4079,
"step": 4970
},
{
"epoch": 0.6897506925207756,
"grad_norm": 2.7639517784118652,
"learning_rate": 3.097211266298548e-05,
"loss": 0.3994,
"step": 4980
},
{
"epoch": 0.6911357340720221,
"grad_norm": 3.754966974258423,
"learning_rate": 3.0953552039348526e-05,
"loss": 0.377,
"step": 4990
},
{
"epoch": 0.6925207756232687,
"grad_norm": 7.626768589019775,
"learning_rate": 3.093499141571157e-05,
"loss": 0.4358,
"step": 5000
},
{
"epoch": 0.6939058171745153,
"grad_norm": 2.9823813438415527,
"learning_rate": 3.0916430792074613e-05,
"loss": 0.412,
"step": 5010
},
{
"epoch": 0.6952908587257618,
"grad_norm": 2.7320168018341064,
"learning_rate": 3.089787016843766e-05,
"loss": 0.3826,
"step": 5020
},
{
"epoch": 0.6966759002770083,
"grad_norm": 2.937063694000244,
"learning_rate": 3.087930954480071e-05,
"loss": 0.368,
"step": 5030
},
{
"epoch": 0.6980609418282548,
"grad_norm": 3.7012999057769775,
"learning_rate": 3.086074892116375e-05,
"loss": 0.4141,
"step": 5040
},
{
"epoch": 0.6994459833795014,
"grad_norm": 3.345500946044922,
"learning_rate": 3.08421882975268e-05,
"loss": 0.4539,
"step": 5050
},
{
"epoch": 0.7008310249307479,
"grad_norm": 2.865732192993164,
"learning_rate": 3.0823627673889846e-05,
"loss": 0.404,
"step": 5060
},
{
"epoch": 0.7022160664819944,
"grad_norm": 3.1072158813476562,
"learning_rate": 3.080506705025289e-05,
"loss": 0.4306,
"step": 5070
},
{
"epoch": 0.703601108033241,
"grad_norm": 3.3509767055511475,
"learning_rate": 3.0786506426615934e-05,
"loss": 0.4512,
"step": 5080
},
{
"epoch": 0.7049861495844876,
"grad_norm": 3.022902727127075,
"learning_rate": 3.0767945802978984e-05,
"loss": 0.4477,
"step": 5090
},
{
"epoch": 0.7063711911357341,
"grad_norm": 3.4046220779418945,
"learning_rate": 3.074938517934203e-05,
"loss": 0.384,
"step": 5100
},
{
"epoch": 0.7077562326869806,
"grad_norm": 4.66195011138916,
"learning_rate": 3.073082455570507e-05,
"loss": 0.436,
"step": 5110
},
{
"epoch": 0.7091412742382271,
"grad_norm": 2.5576562881469727,
"learning_rate": 3.071226393206812e-05,
"loss": 0.3902,
"step": 5120
},
{
"epoch": 0.7105263157894737,
"grad_norm": 3.4099183082580566,
"learning_rate": 3.0693703308431166e-05,
"loss": 0.4065,
"step": 5130
},
{
"epoch": 0.7119113573407202,
"grad_norm": 2.215935230255127,
"learning_rate": 3.067514268479421e-05,
"loss": 0.3731,
"step": 5140
},
{
"epoch": 0.7132963988919667,
"grad_norm": 5.798324108123779,
"learning_rate": 3.0656582061157254e-05,
"loss": 0.4359,
"step": 5150
},
{
"epoch": 0.7146814404432132,
"grad_norm": 1.9546353816986084,
"learning_rate": 3.0638021437520305e-05,
"loss": 0.3753,
"step": 5160
},
{
"epoch": 0.7160664819944599,
"grad_norm": 2.7487800121307373,
"learning_rate": 3.061946081388335e-05,
"loss": 0.4366,
"step": 5170
},
{
"epoch": 0.7174515235457064,
"grad_norm": 4.348243713378906,
"learning_rate": 3.06009001902464e-05,
"loss": 0.3798,
"step": 5180
},
{
"epoch": 0.7188365650969529,
"grad_norm": 5.212021350860596,
"learning_rate": 3.058233956660944e-05,
"loss": 0.3622,
"step": 5190
},
{
"epoch": 0.7202216066481995,
"grad_norm": 2.5864319801330566,
"learning_rate": 3.0563778942972487e-05,
"loss": 0.4204,
"step": 5200
},
{
"epoch": 0.721606648199446,
"grad_norm": 4.407830715179443,
"learning_rate": 3.054521831933553e-05,
"loss": 0.3879,
"step": 5210
},
{
"epoch": 0.7229916897506925,
"grad_norm": 3.6577796936035156,
"learning_rate": 3.0526657695698574e-05,
"loss": 0.3786,
"step": 5220
},
{
"epoch": 0.724376731301939,
"grad_norm": 3.39837384223938,
"learning_rate": 3.0508097072061625e-05,
"loss": 0.402,
"step": 5230
},
{
"epoch": 0.7257617728531855,
"grad_norm": 3.36942982673645,
"learning_rate": 3.048953644842467e-05,
"loss": 0.3856,
"step": 5240
},
{
"epoch": 0.7271468144044322,
"grad_norm": 5.817683219909668,
"learning_rate": 3.0470975824787716e-05,
"loss": 0.4173,
"step": 5250
},
{
"epoch": 0.7285318559556787,
"grad_norm": 3.100637674331665,
"learning_rate": 3.045241520115076e-05,
"loss": 0.3486,
"step": 5260
},
{
"epoch": 0.7299168975069252,
"grad_norm": 6.541337490081787,
"learning_rate": 3.0433854577513807e-05,
"loss": 0.4273,
"step": 5270
},
{
"epoch": 0.7313019390581718,
"grad_norm": 2.8053693771362305,
"learning_rate": 3.041529395387685e-05,
"loss": 0.4533,
"step": 5280
},
{
"epoch": 0.7326869806094183,
"grad_norm": 2.527963638305664,
"learning_rate": 3.03967333302399e-05,
"loss": 0.3753,
"step": 5290
},
{
"epoch": 0.7340720221606648,
"grad_norm": 2.9458463191986084,
"learning_rate": 3.0378172706602945e-05,
"loss": 0.4179,
"step": 5300
},
{
"epoch": 0.7354570637119113,
"grad_norm": 3.373678207397461,
"learning_rate": 3.035961208296599e-05,
"loss": 0.4206,
"step": 5310
},
{
"epoch": 0.7368421052631579,
"grad_norm": 2.686825752258301,
"learning_rate": 3.0341051459329036e-05,
"loss": 0.3849,
"step": 5320
},
{
"epoch": 0.7382271468144044,
"grad_norm": 2.6222784519195557,
"learning_rate": 3.032249083569208e-05,
"loss": 0.3689,
"step": 5330
},
{
"epoch": 0.739612188365651,
"grad_norm": 3.494692087173462,
"learning_rate": 3.0303930212055127e-05,
"loss": 0.4181,
"step": 5340
},
{
"epoch": 0.7409972299168975,
"grad_norm": 4.222794055938721,
"learning_rate": 3.028536958841817e-05,
"loss": 0.4103,
"step": 5350
},
{
"epoch": 0.7423822714681441,
"grad_norm": 2.4981813430786133,
"learning_rate": 3.026680896478122e-05,
"loss": 0.3996,
"step": 5360
},
{
"epoch": 0.7437673130193906,
"grad_norm": 4.366548538208008,
"learning_rate": 3.0248248341144265e-05,
"loss": 0.3658,
"step": 5370
},
{
"epoch": 0.7451523545706371,
"grad_norm": 3.9107649326324463,
"learning_rate": 3.0229687717507312e-05,
"loss": 0.4083,
"step": 5380
},
{
"epoch": 0.7465373961218836,
"grad_norm": 2.8175508975982666,
"learning_rate": 3.0211127093870356e-05,
"loss": 0.3993,
"step": 5390
},
{
"epoch": 0.7479224376731302,
"grad_norm": 3.2297942638397217,
"learning_rate": 3.0192566470233403e-05,
"loss": 0.3592,
"step": 5400
},
{
"epoch": 0.7493074792243767,
"grad_norm": 6.544715404510498,
"learning_rate": 3.0174005846596447e-05,
"loss": 0.4534,
"step": 5410
},
{
"epoch": 0.7506925207756233,
"grad_norm": 4.909923553466797,
"learning_rate": 3.015544522295949e-05,
"loss": 0.4161,
"step": 5420
},
{
"epoch": 0.7520775623268698,
"grad_norm": 4.12788724899292,
"learning_rate": 3.013688459932254e-05,
"loss": 0.3657,
"step": 5430
},
{
"epoch": 0.7534626038781164,
"grad_norm": 4.8378071784973145,
"learning_rate": 3.0118323975685585e-05,
"loss": 0.3741,
"step": 5440
},
{
"epoch": 0.7548476454293629,
"grad_norm": 2.4138081073760986,
"learning_rate": 3.0099763352048633e-05,
"loss": 0.3844,
"step": 5450
},
{
"epoch": 0.7562326869806094,
"grad_norm": 3.8834354877471924,
"learning_rate": 3.0081202728411676e-05,
"loss": 0.3943,
"step": 5460
},
{
"epoch": 0.7576177285318559,
"grad_norm": 5.033694744110107,
"learning_rate": 3.0062642104774724e-05,
"loss": 0.3897,
"step": 5470
},
{
"epoch": 0.7590027700831025,
"grad_norm": 3.515544891357422,
"learning_rate": 3.0044081481137767e-05,
"loss": 0.363,
"step": 5480
},
{
"epoch": 0.760387811634349,
"grad_norm": 4.053267002105713,
"learning_rate": 3.0025520857500815e-05,
"loss": 0.3566,
"step": 5490
},
{
"epoch": 0.7617728531855956,
"grad_norm": 2.7113759517669678,
"learning_rate": 3.000696023386386e-05,
"loss": 0.358,
"step": 5500
},
{
"epoch": 0.7631578947368421,
"grad_norm": 2.1109673976898193,
"learning_rate": 2.998839961022691e-05,
"loss": 0.4205,
"step": 5510
},
{
"epoch": 0.7645429362880887,
"grad_norm": 4.754786014556885,
"learning_rate": 2.9969838986589953e-05,
"loss": 0.414,
"step": 5520
},
{
"epoch": 0.7659279778393352,
"grad_norm": 2.2650856971740723,
"learning_rate": 2.9951278362952997e-05,
"loss": 0.3979,
"step": 5530
},
{
"epoch": 0.7673130193905817,
"grad_norm": 2.702939033508301,
"learning_rate": 2.9932717739316044e-05,
"loss": 0.3778,
"step": 5540
},
{
"epoch": 0.7686980609418282,
"grad_norm": 4.2221455574035645,
"learning_rate": 2.9914157115679088e-05,
"loss": 0.3941,
"step": 5550
},
{
"epoch": 0.7700831024930748,
"grad_norm": 2.9708549976348877,
"learning_rate": 2.9895596492042135e-05,
"loss": 0.4391,
"step": 5560
},
{
"epoch": 0.7714681440443213,
"grad_norm": 4.177848815917969,
"learning_rate": 2.987703586840518e-05,
"loss": 0.3986,
"step": 5570
},
{
"epoch": 0.7728531855955678,
"grad_norm": 5.936276912689209,
"learning_rate": 2.985847524476823e-05,
"loss": 0.3538,
"step": 5580
},
{
"epoch": 0.7742382271468145,
"grad_norm": 2.817993640899658,
"learning_rate": 2.9839914621131273e-05,
"loss": 0.3736,
"step": 5590
},
{
"epoch": 0.775623268698061,
"grad_norm": 3.635772466659546,
"learning_rate": 2.982135399749432e-05,
"loss": 0.3802,
"step": 5600
},
{
"epoch": 0.7770083102493075,
"grad_norm": 4.219478607177734,
"learning_rate": 2.980464943622106e-05,
"loss": 0.4488,
"step": 5610
},
{
"epoch": 0.778393351800554,
"grad_norm": 2.885037422180176,
"learning_rate": 2.9789800937311498e-05,
"loss": 0.4068,
"step": 5620
},
{
"epoch": 0.7797783933518005,
"grad_norm": 4.46284818649292,
"learning_rate": 2.9771240313674542e-05,
"loss": 0.3819,
"step": 5630
},
{
"epoch": 0.7811634349030471,
"grad_norm": 4.2689080238342285,
"learning_rate": 2.975267969003759e-05,
"loss": 0.4599,
"step": 5640
},
{
"epoch": 0.7825484764542936,
"grad_norm": 2.5441861152648926,
"learning_rate": 2.9734119066400633e-05,
"loss": 0.3504,
"step": 5650
},
{
"epoch": 0.7839335180055401,
"grad_norm": 4.003300189971924,
"learning_rate": 2.971555844276368e-05,
"loss": 0.3054,
"step": 5660
},
{
"epoch": 0.7853185595567868,
"grad_norm": 3.1598663330078125,
"learning_rate": 2.9696997819126724e-05,
"loss": 0.3683,
"step": 5670
},
{
"epoch": 0.7867036011080333,
"grad_norm": 3.1675422191619873,
"learning_rate": 2.9678437195489768e-05,
"loss": 0.4038,
"step": 5680
},
{
"epoch": 0.7880886426592798,
"grad_norm": 2.500715732574463,
"learning_rate": 2.965987657185282e-05,
"loss": 0.3432,
"step": 5690
},
{
"epoch": 0.7894736842105263,
"grad_norm": 3.7989180088043213,
"learning_rate": 2.9641315948215862e-05,
"loss": 0.3607,
"step": 5700
},
{
"epoch": 0.7908587257617729,
"grad_norm": 3.9156484603881836,
"learning_rate": 2.962275532457891e-05,
"loss": 0.3906,
"step": 5710
},
{
"epoch": 0.7922437673130194,
"grad_norm": 3.2225682735443115,
"learning_rate": 2.9604194700941953e-05,
"loss": 0.414,
"step": 5720
},
{
"epoch": 0.7936288088642659,
"grad_norm": 2.859682321548462,
"learning_rate": 2.9585634077305e-05,
"loss": 0.3411,
"step": 5730
},
{
"epoch": 0.7950138504155124,
"grad_norm": 2.8549094200134277,
"learning_rate": 2.9567073453668044e-05,
"loss": 0.3899,
"step": 5740
},
{
"epoch": 0.796398891966759,
"grad_norm": 4.423649787902832,
"learning_rate": 2.954851283003109e-05,
"loss": 0.4039,
"step": 5750
},
{
"epoch": 0.7977839335180056,
"grad_norm": 2.772611379623413,
"learning_rate": 2.9529952206394135e-05,
"loss": 0.4265,
"step": 5760
},
{
"epoch": 0.7991689750692521,
"grad_norm": 3.356656789779663,
"learning_rate": 2.9511391582757186e-05,
"loss": 0.3442,
"step": 5770
},
{
"epoch": 0.8005540166204986,
"grad_norm": 2.5400006771087646,
"learning_rate": 2.949283095912023e-05,
"loss": 0.3697,
"step": 5780
},
{
"epoch": 0.8019390581717452,
"grad_norm": 2.1723203659057617,
"learning_rate": 2.9474270335483273e-05,
"loss": 0.3727,
"step": 5790
},
{
"epoch": 0.8033240997229917,
"grad_norm": 5.687036514282227,
"learning_rate": 2.945570971184632e-05,
"loss": 0.4095,
"step": 5800
},
{
"epoch": 0.8047091412742382,
"grad_norm": 3.2231178283691406,
"learning_rate": 2.9437149088209364e-05,
"loss": 0.3397,
"step": 5810
},
{
"epoch": 0.8060941828254847,
"grad_norm": 2.743824005126953,
"learning_rate": 2.941858846457241e-05,
"loss": 0.3663,
"step": 5820
},
{
"epoch": 0.8074792243767313,
"grad_norm": 3.0033442974090576,
"learning_rate": 2.9400027840935455e-05,
"loss": 0.3799,
"step": 5830
},
{
"epoch": 0.8088642659279779,
"grad_norm": 3.7056267261505127,
"learning_rate": 2.9381467217298506e-05,
"loss": 0.388,
"step": 5840
},
{
"epoch": 0.8102493074792244,
"grad_norm": 4.837870121002197,
"learning_rate": 2.936290659366155e-05,
"loss": 0.4043,
"step": 5850
},
{
"epoch": 0.8116343490304709,
"grad_norm": 3.815831422805786,
"learning_rate": 2.9344345970024597e-05,
"loss": 0.34,
"step": 5860
},
{
"epoch": 0.8130193905817175,
"grad_norm": 3.2043607234954834,
"learning_rate": 2.932578534638764e-05,
"loss": 0.3182,
"step": 5870
},
{
"epoch": 0.814404432132964,
"grad_norm": 3.9446229934692383,
"learning_rate": 2.9307224722750688e-05,
"loss": 0.3658,
"step": 5880
},
{
"epoch": 0.8157894736842105,
"grad_norm": 1.8936973810195923,
"learning_rate": 2.9288664099113732e-05,
"loss": 0.3958,
"step": 5890
},
{
"epoch": 0.817174515235457,
"grad_norm": 4.075359344482422,
"learning_rate": 2.9270103475476776e-05,
"loss": 0.4343,
"step": 5900
},
{
"epoch": 0.8185595567867036,
"grad_norm": 3.6802127361297607,
"learning_rate": 2.9251542851839826e-05,
"loss": 0.3579,
"step": 5910
},
{
"epoch": 0.8199445983379502,
"grad_norm": 2.847928285598755,
"learning_rate": 2.923298222820287e-05,
"loss": 0.3709,
"step": 5920
},
{
"epoch": 0.8213296398891967,
"grad_norm": 3.1300485134124756,
"learning_rate": 2.9214421604565917e-05,
"loss": 0.3932,
"step": 5930
},
{
"epoch": 0.8227146814404432,
"grad_norm": 4.011430263519287,
"learning_rate": 2.919586098092896e-05,
"loss": 0.3944,
"step": 5940
},
{
"epoch": 0.8240997229916898,
"grad_norm": 2.8449504375457764,
"learning_rate": 2.917730035729201e-05,
"loss": 0.3857,
"step": 5950
},
{
"epoch": 0.8254847645429363,
"grad_norm": 2.396585464477539,
"learning_rate": 2.9158739733655052e-05,
"loss": 0.3609,
"step": 5960
},
{
"epoch": 0.8268698060941828,
"grad_norm": 2.1157963275909424,
"learning_rate": 2.91401791100181e-05,
"loss": 0.3388,
"step": 5970
},
{
"epoch": 0.8282548476454293,
"grad_norm": 4.279614448547363,
"learning_rate": 2.9121618486381143e-05,
"loss": 0.3645,
"step": 5980
},
{
"epoch": 0.8296398891966759,
"grad_norm": 2.4833781719207764,
"learning_rate": 2.9103057862744187e-05,
"loss": 0.3857,
"step": 5990
},
{
"epoch": 0.8310249307479224,
"grad_norm": 3.525597095489502,
"learning_rate": 2.9084497239107238e-05,
"loss": 0.3576,
"step": 6000
},
{
"epoch": 0.8313019390581717,
"eval_loss": 0.3741193115711212,
"eval_runtime": 1433.4277,
"eval_samples_per_second": 6.362,
"eval_steps_per_second": 0.795,
"step": 6002
},
{
"epoch": 0.832409972299169,
"grad_norm": 4.141083717346191,
"learning_rate": 2.906593661547028e-05,
"loss": 0.3799,
"step": 6010
},
{
"epoch": 0.8337950138504155,
"grad_norm": 5.583711624145508,
"learning_rate": 2.904737599183333e-05,
"loss": 0.393,
"step": 6020
},
{
"epoch": 0.8351800554016621,
"grad_norm": 2.304410457611084,
"learning_rate": 2.9028815368196372e-05,
"loss": 0.3457,
"step": 6030
},
{
"epoch": 0.8365650969529086,
"grad_norm": 2.9231860637664795,
"learning_rate": 2.901025474455942e-05,
"loss": 0.3795,
"step": 6040
},
{
"epoch": 0.8379501385041551,
"grad_norm": 5.1649980545043945,
"learning_rate": 2.8991694120922463e-05,
"loss": 0.3586,
"step": 6050
},
{
"epoch": 0.8393351800554016,
"grad_norm": 3.0985405445098877,
"learning_rate": 2.8973133497285514e-05,
"loss": 0.4043,
"step": 6060
},
{
"epoch": 0.8407202216066482,
"grad_norm": 3.0035006999969482,
"learning_rate": 2.8954572873648558e-05,
"loss": 0.3332,
"step": 6070
},
{
"epoch": 0.8421052631578947,
"grad_norm": 3.4762930870056152,
"learning_rate": 2.8936012250011605e-05,
"loss": 0.4091,
"step": 6080
},
{
"epoch": 0.8434903047091413,
"grad_norm": 5.735511302947998,
"learning_rate": 2.891745162637465e-05,
"loss": 0.3473,
"step": 6090
},
{
"epoch": 0.8448753462603878,
"grad_norm": 3.5551562309265137,
"learning_rate": 2.8898891002737693e-05,
"loss": 0.3818,
"step": 6100
},
{
"epoch": 0.8462603878116344,
"grad_norm": 3.491703510284424,
"learning_rate": 2.888033037910074e-05,
"loss": 0.3756,
"step": 6110
},
{
"epoch": 0.8476454293628809,
"grad_norm": 2.843029260635376,
"learning_rate": 2.8861769755463784e-05,
"loss": 0.3641,
"step": 6120
},
{
"epoch": 0.8490304709141274,
"grad_norm": 2.7495148181915283,
"learning_rate": 2.8843209131826834e-05,
"loss": 0.4011,
"step": 6130
},
{
"epoch": 0.850415512465374,
"grad_norm": 4.130334377288818,
"learning_rate": 2.8824648508189878e-05,
"loss": 0.3776,
"step": 6140
},
{
"epoch": 0.8518005540166205,
"grad_norm": 4.290791034698486,
"learning_rate": 2.8806087884552925e-05,
"loss": 0.4193,
"step": 6150
},
{
"epoch": 0.853185595567867,
"grad_norm": 2.533083438873291,
"learning_rate": 2.878752726091597e-05,
"loss": 0.3763,
"step": 6160
},
{
"epoch": 0.8545706371191135,
"grad_norm": 5.1947712898254395,
"learning_rate": 2.8768966637279016e-05,
"loss": 0.393,
"step": 6170
},
{
"epoch": 0.8559556786703602,
"grad_norm": 2.4162371158599854,
"learning_rate": 2.875040601364206e-05,
"loss": 0.4049,
"step": 6180
},
{
"epoch": 0.8573407202216067,
"grad_norm": 3.275009870529175,
"learning_rate": 2.8731845390005107e-05,
"loss": 0.3903,
"step": 6190
},
{
"epoch": 0.8587257617728532,
"grad_norm": 2.260618209838867,
"learning_rate": 2.871328476636815e-05,
"loss": 0.3721,
"step": 6200
},
{
"epoch": 0.8601108033240997,
"grad_norm": 2.923968553543091,
"learning_rate": 2.8694724142731195e-05,
"loss": 0.3749,
"step": 6210
},
{
"epoch": 0.8614958448753463,
"grad_norm": 5.046422481536865,
"learning_rate": 2.8676163519094245e-05,
"loss": 0.4056,
"step": 6220
},
{
"epoch": 0.8628808864265928,
"grad_norm": 3.8435168266296387,
"learning_rate": 2.865760289545729e-05,
"loss": 0.3755,
"step": 6230
},
{
"epoch": 0.8642659279778393,
"grad_norm": 3.553178548812866,
"learning_rate": 2.8639042271820336e-05,
"loss": 0.3926,
"step": 6240
},
{
"epoch": 0.8656509695290858,
"grad_norm": 3.8052849769592285,
"learning_rate": 2.862048164818338e-05,
"loss": 0.3685,
"step": 6250
},
{
"epoch": 0.8670360110803325,
"grad_norm": 2.288756847381592,
"learning_rate": 2.8601921024546427e-05,
"loss": 0.3499,
"step": 6260
},
{
"epoch": 0.868421052631579,
"grad_norm": 3.1977062225341797,
"learning_rate": 2.858336040090947e-05,
"loss": 0.3487,
"step": 6270
},
{
"epoch": 0.8698060941828255,
"grad_norm": 3.752891778945923,
"learning_rate": 2.8564799777272522e-05,
"loss": 0.3952,
"step": 6280
},
{
"epoch": 0.871191135734072,
"grad_norm": 2.5730514526367188,
"learning_rate": 2.8546239153635566e-05,
"loss": 0.349,
"step": 6290
},
{
"epoch": 0.8725761772853186,
"grad_norm": 3.132359027862549,
"learning_rate": 2.852767852999861e-05,
"loss": 0.3595,
"step": 6300
},
{
"epoch": 0.8739612188365651,
"grad_norm": 3.2165608406066895,
"learning_rate": 2.8509117906361657e-05,
"loss": 0.3478,
"step": 6310
},
{
"epoch": 0.8753462603878116,
"grad_norm": 10.091190338134766,
"learning_rate": 2.84905572827247e-05,
"loss": 0.3414,
"step": 6320
},
{
"epoch": 0.8767313019390581,
"grad_norm": 3.1347556114196777,
"learning_rate": 2.8471996659087748e-05,
"loss": 0.4414,
"step": 6330
},
{
"epoch": 0.8781163434903048,
"grad_norm": 2.5149002075195312,
"learning_rate": 2.845343603545079e-05,
"loss": 0.3524,
"step": 6340
},
{
"epoch": 0.8795013850415513,
"grad_norm": 3.1567630767822266,
"learning_rate": 2.843487541181384e-05,
"loss": 0.3568,
"step": 6350
},
{
"epoch": 0.8808864265927978,
"grad_norm": 3.1177406311035156,
"learning_rate": 2.8416314788176883e-05,
"loss": 0.3671,
"step": 6360
},
{
"epoch": 0.8822714681440443,
"grad_norm": 2.766374349594116,
"learning_rate": 2.8397754164539933e-05,
"loss": 0.3389,
"step": 6370
},
{
"epoch": 0.8836565096952909,
"grad_norm": 2.419781446456909,
"learning_rate": 2.8379193540902977e-05,
"loss": 0.3868,
"step": 6380
},
{
"epoch": 0.8850415512465374,
"grad_norm": 2.5221714973449707,
"learning_rate": 2.8360632917266024e-05,
"loss": 0.3744,
"step": 6390
},
{
"epoch": 0.8864265927977839,
"grad_norm": 4.218471050262451,
"learning_rate": 2.8342072293629068e-05,
"loss": 0.3245,
"step": 6400
},
{
"epoch": 0.8878116343490304,
"grad_norm": 2.45046067237854,
"learning_rate": 2.8323511669992112e-05,
"loss": 0.3801,
"step": 6410
},
{
"epoch": 0.889196675900277,
"grad_norm": 3.297358989715576,
"learning_rate": 2.830495104635516e-05,
"loss": 0.364,
"step": 6420
},
{
"epoch": 0.8905817174515236,
"grad_norm": 2.321702003479004,
"learning_rate": 2.8286390422718203e-05,
"loss": 0.3487,
"step": 6430
},
{
"epoch": 0.8919667590027701,
"grad_norm": 2.7762234210968018,
"learning_rate": 2.8267829799081253e-05,
"loss": 0.3723,
"step": 6440
},
{
"epoch": 0.8933518005540166,
"grad_norm": 2.534067392349243,
"learning_rate": 2.8249269175444297e-05,
"loss": 0.3825,
"step": 6450
},
{
"epoch": 0.8947368421052632,
"grad_norm": 6.5774078369140625,
"learning_rate": 2.8230708551807344e-05,
"loss": 0.43,
"step": 6460
},
{
"epoch": 0.8961218836565097,
"grad_norm": 2.7400996685028076,
"learning_rate": 2.8212147928170388e-05,
"loss": 0.3599,
"step": 6470
},
{
"epoch": 0.8975069252077562,
"grad_norm": 1.8518990278244019,
"learning_rate": 2.8193587304533435e-05,
"loss": 0.3228,
"step": 6480
},
{
"epoch": 0.8988919667590027,
"grad_norm": 2.071018695831299,
"learning_rate": 2.817502668089648e-05,
"loss": 0.3509,
"step": 6490
},
{
"epoch": 0.9002770083102493,
"grad_norm": 3.566608428955078,
"learning_rate": 2.815646605725953e-05,
"loss": 0.3726,
"step": 6500
},
{
"epoch": 0.9016620498614959,
"grad_norm": 2.569943904876709,
"learning_rate": 2.8137905433622574e-05,
"loss": 0.3811,
"step": 6510
},
{
"epoch": 0.9030470914127424,
"grad_norm": 3.897632598876953,
"learning_rate": 2.8119344809985617e-05,
"loss": 0.3855,
"step": 6520
},
{
"epoch": 0.9044321329639889,
"grad_norm": 2.6584584712982178,
"learning_rate": 2.8100784186348665e-05,
"loss": 0.3709,
"step": 6530
},
{
"epoch": 0.9058171745152355,
"grad_norm": 2.8664989471435547,
"learning_rate": 2.808222356271171e-05,
"loss": 0.3666,
"step": 6540
},
{
"epoch": 0.907202216066482,
"grad_norm": 2.7662582397460938,
"learning_rate": 2.8063662939074756e-05,
"loss": 0.3615,
"step": 6550
},
{
"epoch": 0.9085872576177285,
"grad_norm": 3.2466659545898438,
"learning_rate": 2.80451023154378e-05,
"loss": 0.4115,
"step": 6560
},
{
"epoch": 0.909972299168975,
"grad_norm": 2.0117952823638916,
"learning_rate": 2.8026541691800847e-05,
"loss": 0.336,
"step": 6570
},
{
"epoch": 0.9113573407202216,
"grad_norm": 3.4352803230285645,
"learning_rate": 2.800798106816389e-05,
"loss": 0.3656,
"step": 6580
},
{
"epoch": 0.9127423822714681,
"grad_norm": 2.4090380668640137,
"learning_rate": 2.798942044452694e-05,
"loss": 0.3774,
"step": 6590
},
{
"epoch": 0.9141274238227147,
"grad_norm": 4.36330509185791,
"learning_rate": 2.7970859820889985e-05,
"loss": 0.4315,
"step": 6600
},
{
"epoch": 0.9155124653739612,
"grad_norm": 3.1175436973571777,
"learning_rate": 2.7952299197253032e-05,
"loss": 0.3692,
"step": 6610
},
{
"epoch": 0.9168975069252078,
"grad_norm": 3.6826770305633545,
"learning_rate": 2.7933738573616076e-05,
"loss": 0.4001,
"step": 6620
},
{
"epoch": 0.9182825484764543,
"grad_norm": 2.450596570968628,
"learning_rate": 2.791517794997912e-05,
"loss": 0.3838,
"step": 6630
},
{
"epoch": 0.9196675900277008,
"grad_norm": 3.6701254844665527,
"learning_rate": 2.7896617326342167e-05,
"loss": 0.3145,
"step": 6640
},
{
"epoch": 0.9210526315789473,
"grad_norm": 3.4804341793060303,
"learning_rate": 2.787805670270521e-05,
"loss": 0.3894,
"step": 6650
},
{
"epoch": 0.9224376731301939,
"grad_norm": 3.159144163131714,
"learning_rate": 2.785949607906826e-05,
"loss": 0.338,
"step": 6660
},
{
"epoch": 0.9238227146814404,
"grad_norm": 4.329410076141357,
"learning_rate": 2.7840935455431305e-05,
"loss": 0.3899,
"step": 6670
},
{
"epoch": 0.925207756232687,
"grad_norm": 3.3670008182525635,
"learning_rate": 2.7822374831794352e-05,
"loss": 0.307,
"step": 6680
},
{
"epoch": 0.9265927977839336,
"grad_norm": 2.9940109252929688,
"learning_rate": 2.7803814208157396e-05,
"loss": 0.3342,
"step": 6690
},
{
"epoch": 0.9279778393351801,
"grad_norm": 3.1586687564849854,
"learning_rate": 2.7785253584520443e-05,
"loss": 0.3562,
"step": 6700
},
{
"epoch": 0.9293628808864266,
"grad_norm": 2.943342447280884,
"learning_rate": 2.7766692960883487e-05,
"loss": 0.3607,
"step": 6710
},
{
"epoch": 0.9307479224376731,
"grad_norm": 2.752495050430298,
"learning_rate": 2.774813233724653e-05,
"loss": 0.4183,
"step": 6720
},
{
"epoch": 0.9321329639889196,
"grad_norm": 4.039161205291748,
"learning_rate": 2.772957171360958e-05,
"loss": 0.3647,
"step": 6730
},
{
"epoch": 0.9335180055401662,
"grad_norm": 3.0139365196228027,
"learning_rate": 2.7711011089972625e-05,
"loss": 0.3389,
"step": 6740
},
{
"epoch": 0.9349030470914127,
"grad_norm": 3.619724750518799,
"learning_rate": 2.7692450466335672e-05,
"loss": 0.3462,
"step": 6750
},
{
"epoch": 0.9362880886426593,
"grad_norm": 1.989438772201538,
"learning_rate": 2.7673889842698716e-05,
"loss": 0.3869,
"step": 6760
},
{
"epoch": 0.9376731301939059,
"grad_norm": 2.1334989070892334,
"learning_rate": 2.7655329219061763e-05,
"loss": 0.317,
"step": 6770
},
{
"epoch": 0.9390581717451524,
"grad_norm": 3.1076736450195312,
"learning_rate": 2.7636768595424807e-05,
"loss": 0.3654,
"step": 6780
},
{
"epoch": 0.9404432132963989,
"grad_norm": 4.218199253082275,
"learning_rate": 2.7618207971787854e-05,
"loss": 0.356,
"step": 6790
},
{
"epoch": 0.9418282548476454,
"grad_norm": 3.333724021911621,
"learning_rate": 2.7599647348150898e-05,
"loss": 0.3675,
"step": 6800
},
{
"epoch": 0.943213296398892,
"grad_norm": 2.9643044471740723,
"learning_rate": 2.758108672451395e-05,
"loss": 0.3695,
"step": 6810
},
{
"epoch": 0.9445983379501385,
"grad_norm": 4.082902431488037,
"learning_rate": 2.7562526100876993e-05,
"loss": 0.3575,
"step": 6820
},
{
"epoch": 0.945983379501385,
"grad_norm": 2.4494361877441406,
"learning_rate": 2.7543965477240037e-05,
"loss": 0.3926,
"step": 6830
},
{
"epoch": 0.9473684210526315,
"grad_norm": 2.541417121887207,
"learning_rate": 2.7525404853603084e-05,
"loss": 0.3666,
"step": 6840
},
{
"epoch": 0.9487534626038782,
"grad_norm": 3.4122767448425293,
"learning_rate": 2.7506844229966128e-05,
"loss": 0.3842,
"step": 6850
},
{
"epoch": 0.9501385041551247,
"grad_norm": 4.269729137420654,
"learning_rate": 2.7488283606329175e-05,
"loss": 0.375,
"step": 6860
},
{
"epoch": 0.9515235457063712,
"grad_norm": 2.563281297683716,
"learning_rate": 2.746972298269222e-05,
"loss": 0.3856,
"step": 6870
},
{
"epoch": 0.9529085872576177,
"grad_norm": 3.160914182662964,
"learning_rate": 2.745116235905527e-05,
"loss": 0.3851,
"step": 6880
},
{
"epoch": 0.9542936288088643,
"grad_norm": 2.7195470333099365,
"learning_rate": 2.7432601735418313e-05,
"loss": 0.3667,
"step": 6890
},
{
"epoch": 0.9556786703601108,
"grad_norm": 2.226888418197632,
"learning_rate": 2.741404111178136e-05,
"loss": 0.397,
"step": 6900
},
{
"epoch": 0.9570637119113573,
"grad_norm": 2.3146603107452393,
"learning_rate": 2.7395480488144404e-05,
"loss": 0.3152,
"step": 6910
},
{
"epoch": 0.9584487534626038,
"grad_norm": 5.1704487800598145,
"learning_rate": 2.737691986450745e-05,
"loss": 0.389,
"step": 6920
},
{
"epoch": 0.9598337950138505,
"grad_norm": 3.0150206089019775,
"learning_rate": 2.7358359240870495e-05,
"loss": 0.3743,
"step": 6930
},
{
"epoch": 0.961218836565097,
"grad_norm": 4.35557222366333,
"learning_rate": 2.733979861723354e-05,
"loss": 0.3308,
"step": 6940
},
{
"epoch": 0.9626038781163435,
"grad_norm": 3.1160595417022705,
"learning_rate": 2.732123799359659e-05,
"loss": 0.3711,
"step": 6950
},
{
"epoch": 0.96398891966759,
"grad_norm": 2.697511672973633,
"learning_rate": 2.7302677369959633e-05,
"loss": 0.3848,
"step": 6960
},
{
"epoch": 0.9653739612188366,
"grad_norm": 2.6081149578094482,
"learning_rate": 2.728411674632268e-05,
"loss": 0.3419,
"step": 6970
},
{
"epoch": 0.9667590027700831,
"grad_norm": 3.462388753890991,
"learning_rate": 2.7265556122685724e-05,
"loss": 0.351,
"step": 6980
},
{
"epoch": 0.9681440443213296,
"grad_norm": 4.886613368988037,
"learning_rate": 2.724699549904877e-05,
"loss": 0.3263,
"step": 6990
},
{
"epoch": 0.9695290858725761,
"grad_norm": 3.7553138732910156,
"learning_rate": 2.7228434875411815e-05,
"loss": 0.3353,
"step": 7000
},
{
"epoch": 0.9709141274238227,
"grad_norm": 2.2478718757629395,
"learning_rate": 2.7209874251774862e-05,
"loss": 0.3363,
"step": 7010
},
{
"epoch": 0.9722991689750693,
"grad_norm": 2.6377060413360596,
"learning_rate": 2.7191313628137906e-05,
"loss": 0.3089,
"step": 7020
},
{
"epoch": 0.9736842105263158,
"grad_norm": 4.078857898712158,
"learning_rate": 2.717275300450095e-05,
"loss": 0.3259,
"step": 7030
},
{
"epoch": 0.9750692520775623,
"grad_norm": 4.050114154815674,
"learning_rate": 2.7154192380864e-05,
"loss": 0.3485,
"step": 7040
},
{
"epoch": 0.9764542936288089,
"grad_norm": 2.680589437484741,
"learning_rate": 2.7135631757227044e-05,
"loss": 0.3903,
"step": 7050
},
{
"epoch": 0.9778393351800554,
"grad_norm": 2.600092649459839,
"learning_rate": 2.711707113359009e-05,
"loss": 0.3716,
"step": 7060
},
{
"epoch": 0.9792243767313019,
"grad_norm": 2.405036211013794,
"learning_rate": 2.7098510509953135e-05,
"loss": 0.3366,
"step": 7070
},
{
"epoch": 0.9806094182825484,
"grad_norm": 2.041038751602173,
"learning_rate": 2.7079949886316183e-05,
"loss": 0.3859,
"step": 7080
},
{
"epoch": 0.981994459833795,
"grad_norm": 3.4083938598632812,
"learning_rate": 2.7061389262679226e-05,
"loss": 0.3576,
"step": 7090
},
{
"epoch": 0.9833795013850416,
"grad_norm": 2.4382524490356445,
"learning_rate": 2.7042828639042277e-05,
"loss": 0.3199,
"step": 7100
},
{
"epoch": 0.9847645429362881,
"grad_norm": 2.378704071044922,
"learning_rate": 2.702426801540532e-05,
"loss": 0.399,
"step": 7110
},
{
"epoch": 0.9861495844875346,
"grad_norm": 1.8466202020645142,
"learning_rate": 2.7005707391768368e-05,
"loss": 0.3673,
"step": 7120
},
{
"epoch": 0.9875346260387812,
"grad_norm": 2.051624059677124,
"learning_rate": 2.6987146768131412e-05,
"loss": 0.327,
"step": 7130
},
{
"epoch": 0.9889196675900277,
"grad_norm": 3.790463924407959,
"learning_rate": 2.6968586144494456e-05,
"loss": 0.3481,
"step": 7140
},
{
"epoch": 0.9903047091412742,
"grad_norm": 3.0214810371398926,
"learning_rate": 2.6950025520857503e-05,
"loss": 0.2776,
"step": 7150
},
{
"epoch": 0.9916897506925207,
"grad_norm": 3.270256519317627,
"learning_rate": 2.6931464897220547e-05,
"loss": 0.4097,
"step": 7160
},
{
"epoch": 0.9930747922437673,
"grad_norm": 1.8735462427139282,
"learning_rate": 2.6912904273583594e-05,
"loss": 0.3464,
"step": 7170
},
{
"epoch": 0.9944598337950139,
"grad_norm": 2.376981496810913,
"learning_rate": 2.6894343649946638e-05,
"loss": 0.2948,
"step": 7180
},
{
"epoch": 0.9958448753462604,
"grad_norm": 2.929786205291748,
"learning_rate": 2.6875783026309688e-05,
"loss": 0.3857,
"step": 7190
},
{
"epoch": 0.997229916897507,
"grad_norm": 3.4466960430145264,
"learning_rate": 2.6857222402672732e-05,
"loss": 0.4073,
"step": 7200
},
{
"epoch": 0.9986149584487535,
"grad_norm": 2.4240450859069824,
"learning_rate": 2.683866177903578e-05,
"loss": 0.3999,
"step": 7210
},
{
"epoch": 1.0,
"grad_norm": 4.055125713348389,
"learning_rate": 2.6820101155398823e-05,
"loss": 0.3633,
"step": 7220
},
{
"epoch": 1.0013850415512466,
"grad_norm": 2.4502010345458984,
"learning_rate": 2.680154053176187e-05,
"loss": 0.3134,
"step": 7230
},
{
"epoch": 1.002770083102493,
"grad_norm": 6.7965779304504395,
"learning_rate": 2.6782979908124914e-05,
"loss": 0.2852,
"step": 7240
},
{
"epoch": 1.0041551246537397,
"grad_norm": 3.1844305992126465,
"learning_rate": 2.6764419284487958e-05,
"loss": 0.2654,
"step": 7250
},
{
"epoch": 1.005540166204986,
"grad_norm": 3.839517831802368,
"learning_rate": 2.674585866085101e-05,
"loss": 0.3125,
"step": 7260
},
{
"epoch": 1.0069252077562327,
"grad_norm": 2.922276020050049,
"learning_rate": 2.6727298037214052e-05,
"loss": 0.3124,
"step": 7270
},
{
"epoch": 1.0083102493074791,
"grad_norm": 1.8956462144851685,
"learning_rate": 2.67087374135771e-05,
"loss": 0.3266,
"step": 7280
},
{
"epoch": 1.0096952908587258,
"grad_norm": 2.989438533782959,
"learning_rate": 2.6690176789940143e-05,
"loss": 0.3117,
"step": 7290
},
{
"epoch": 1.0110803324099722,
"grad_norm": 4.252190589904785,
"learning_rate": 2.667161616630319e-05,
"loss": 0.3084,
"step": 7300
},
{
"epoch": 1.0124653739612188,
"grad_norm": 2.619835138320923,
"learning_rate": 2.6653055542666234e-05,
"loss": 0.3015,
"step": 7310
},
{
"epoch": 1.0138504155124655,
"grad_norm": 3.276683807373047,
"learning_rate": 2.6634494919029285e-05,
"loss": 0.3145,
"step": 7320
},
{
"epoch": 1.0152354570637119,
"grad_norm": 1.749790906906128,
"learning_rate": 2.661593429539233e-05,
"loss": 0.291,
"step": 7330
},
{
"epoch": 1.0166204986149585,
"grad_norm": 3.3701796531677246,
"learning_rate": 2.6597373671755373e-05,
"loss": 0.3213,
"step": 7340
},
{
"epoch": 1.018005540166205,
"grad_norm": 2.0237927436828613,
"learning_rate": 2.657881304811842e-05,
"loss": 0.285,
"step": 7350
},
{
"epoch": 1.0193905817174516,
"grad_norm": 5.210159778594971,
"learning_rate": 2.6560252424481464e-05,
"loss": 0.3031,
"step": 7360
},
{
"epoch": 1.020775623268698,
"grad_norm": 3.3727283477783203,
"learning_rate": 2.654169180084451e-05,
"loss": 0.299,
"step": 7370
},
{
"epoch": 1.0221606648199446,
"grad_norm": 2.3973450660705566,
"learning_rate": 2.6523131177207555e-05,
"loss": 0.29,
"step": 7380
},
{
"epoch": 1.0235457063711912,
"grad_norm": 2.515382766723633,
"learning_rate": 2.6504570553570602e-05,
"loss": 0.2893,
"step": 7390
},
{
"epoch": 1.0249307479224377,
"grad_norm": 2.33882999420166,
"learning_rate": 2.6486009929933646e-05,
"loss": 0.2978,
"step": 7400
},
{
"epoch": 1.0263157894736843,
"grad_norm": 2.946920871734619,
"learning_rate": 2.6467449306296696e-05,
"loss": 0.2885,
"step": 7410
},
{
"epoch": 1.0277008310249307,
"grad_norm": 6.108001708984375,
"learning_rate": 2.644888868265974e-05,
"loss": 0.283,
"step": 7420
},
{
"epoch": 1.0290858725761773,
"grad_norm": 2.6035192012786865,
"learning_rate": 2.6430328059022787e-05,
"loss": 0.3212,
"step": 7430
},
{
"epoch": 1.0304709141274238,
"grad_norm": 2.987347364425659,
"learning_rate": 2.641176743538583e-05,
"loss": 0.2891,
"step": 7440
},
{
"epoch": 1.0318559556786704,
"grad_norm": 2.5733394622802734,
"learning_rate": 2.6393206811748875e-05,
"loss": 0.3054,
"step": 7450
},
{
"epoch": 1.0332409972299168,
"grad_norm": 3.5433621406555176,
"learning_rate": 2.6374646188111922e-05,
"loss": 0.3316,
"step": 7460
},
{
"epoch": 1.0346260387811634,
"grad_norm": 2.2597479820251465,
"learning_rate": 2.6356085564474966e-05,
"loss": 0.3214,
"step": 7470
},
{
"epoch": 1.03601108033241,
"grad_norm": 3.3408634662628174,
"learning_rate": 2.6337524940838016e-05,
"loss": 0.2911,
"step": 7480
},
{
"epoch": 1.0373961218836565,
"grad_norm": 2.973517417907715,
"learning_rate": 2.631896431720106e-05,
"loss": 0.263,
"step": 7490
},
{
"epoch": 1.0387811634349031,
"grad_norm": 2.121873378753662,
"learning_rate": 2.6300403693564107e-05,
"loss": 0.2777,
"step": 7500
},
{
"epoch": 1.0401662049861495,
"grad_norm": 1.9657741785049438,
"learning_rate": 2.628184306992715e-05,
"loss": 0.2595,
"step": 7510
},
{
"epoch": 1.0415512465373962,
"grad_norm": 2.234025716781616,
"learning_rate": 2.62632824462902e-05,
"loss": 0.2688,
"step": 7520
},
{
"epoch": 1.0429362880886426,
"grad_norm": 2.578979969024658,
"learning_rate": 2.6244721822653242e-05,
"loss": 0.3076,
"step": 7530
},
{
"epoch": 1.0443213296398892,
"grad_norm": 3.483440637588501,
"learning_rate": 2.6226161199016293e-05,
"loss": 0.2832,
"step": 7540
},
{
"epoch": 1.0457063711911356,
"grad_norm": 1.7430837154388428,
"learning_rate": 2.6207600575379337e-05,
"loss": 0.2926,
"step": 7550
},
{
"epoch": 1.0470914127423823,
"grad_norm": 3.583524703979492,
"learning_rate": 2.618903995174238e-05,
"loss": 0.3038,
"step": 7560
},
{
"epoch": 1.048476454293629,
"grad_norm": 2.33888840675354,
"learning_rate": 2.6170479328105428e-05,
"loss": 0.3276,
"step": 7570
},
{
"epoch": 1.0498614958448753,
"grad_norm": 2.268723249435425,
"learning_rate": 2.615191870446847e-05,
"loss": 0.2792,
"step": 7580
},
{
"epoch": 1.051246537396122,
"grad_norm": 2.320464611053467,
"learning_rate": 2.613335808083152e-05,
"loss": 0.2704,
"step": 7590
},
{
"epoch": 1.0526315789473684,
"grad_norm": 2.0587191581726074,
"learning_rate": 2.6114797457194562e-05,
"loss": 0.3529,
"step": 7600
},
{
"epoch": 1.054016620498615,
"grad_norm": 3.5844640731811523,
"learning_rate": 2.609623683355761e-05,
"loss": 0.3146,
"step": 7610
},
{
"epoch": 1.0554016620498614,
"grad_norm": 2.7052695751190186,
"learning_rate": 2.6077676209920653e-05,
"loss": 0.3062,
"step": 7620
},
{
"epoch": 1.056786703601108,
"grad_norm": 2.0099308490753174,
"learning_rate": 2.6059115586283704e-05,
"loss": 0.2721,
"step": 7630
},
{
"epoch": 1.0581717451523547,
"grad_norm": 2.6577095985412598,
"learning_rate": 2.6040554962646748e-05,
"loss": 0.3372,
"step": 7640
},
{
"epoch": 1.059556786703601,
"grad_norm": 5.141023635864258,
"learning_rate": 2.602199433900979e-05,
"loss": 0.2875,
"step": 7650
},
{
"epoch": 1.0609418282548477,
"grad_norm": 3.8370797634124756,
"learning_rate": 2.600343371537284e-05,
"loss": 0.2623,
"step": 7660
},
{
"epoch": 1.0623268698060941,
"grad_norm": 2.1279635429382324,
"learning_rate": 2.5984873091735883e-05,
"loss": 0.2965,
"step": 7670
},
{
"epoch": 1.0637119113573408,
"grad_norm": 3.3452510833740234,
"learning_rate": 2.596816853046263e-05,
"loss": 0.2858,
"step": 7680
},
{
"epoch": 1.0650969529085872,
"grad_norm": 1.935064435005188,
"learning_rate": 2.5949607906825673e-05,
"loss": 0.2622,
"step": 7690
},
{
"epoch": 1.0664819944598338,
"grad_norm": 3.9529714584350586,
"learning_rate": 2.593104728318872e-05,
"loss": 0.3025,
"step": 7700
},
{
"epoch": 1.0678670360110802,
"grad_norm": 4.040821075439453,
"learning_rate": 2.5912486659551764e-05,
"loss": 0.2804,
"step": 7710
},
{
"epoch": 1.0692520775623269,
"grad_norm": 2.2184488773345947,
"learning_rate": 2.5893926035914808e-05,
"loss": 0.2972,
"step": 7720
},
{
"epoch": 1.0706371191135735,
"grad_norm": 2.0147476196289062,
"learning_rate": 2.5875365412277855e-05,
"loss": 0.2688,
"step": 7730
},
{
"epoch": 1.07202216066482,
"grad_norm": 2.8058619499206543,
"learning_rate": 2.58568047886409e-05,
"loss": 0.2834,
"step": 7740
},
{
"epoch": 1.0734072022160666,
"grad_norm": 2.537014961242676,
"learning_rate": 2.5838244165003946e-05,
"loss": 0.3082,
"step": 7750
},
{
"epoch": 1.074792243767313,
"grad_norm": 2.190993309020996,
"learning_rate": 2.581968354136699e-05,
"loss": 0.279,
"step": 7760
},
{
"epoch": 1.0761772853185596,
"grad_norm": 2.4779722690582275,
"learning_rate": 2.580112291773004e-05,
"loss": 0.3169,
"step": 7770
},
{
"epoch": 1.077562326869806,
"grad_norm": 2.9042325019836426,
"learning_rate": 2.5782562294093084e-05,
"loss": 0.2668,
"step": 7780
},
{
"epoch": 1.0789473684210527,
"grad_norm": 3.5843074321746826,
"learning_rate": 2.576400167045613e-05,
"loss": 0.3598,
"step": 7790
},
{
"epoch": 1.080332409972299,
"grad_norm": 3.1596109867095947,
"learning_rate": 2.5745441046819175e-05,
"loss": 0.3084,
"step": 7800
},
{
"epoch": 1.0817174515235457,
"grad_norm": 2.0699515342712402,
"learning_rate": 2.572688042318222e-05,
"loss": 0.2557,
"step": 7810
},
{
"epoch": 1.0831024930747923,
"grad_norm": 2.1602230072021484,
"learning_rate": 2.5708319799545266e-05,
"loss": 0.3003,
"step": 7820
},
{
"epoch": 1.0844875346260388,
"grad_norm": 2.242274522781372,
"learning_rate": 2.568975917590831e-05,
"loss": 0.2886,
"step": 7830
},
{
"epoch": 1.0858725761772854,
"grad_norm": 2.3011515140533447,
"learning_rate": 2.567119855227136e-05,
"loss": 0.3011,
"step": 7840
},
{
"epoch": 1.0872576177285318,
"grad_norm": 2.657965898513794,
"learning_rate": 2.56544939909981e-05,
"loss": 0.3247,
"step": 7850
},
{
"epoch": 1.0886426592797784,
"grad_norm": 3.9587013721466064,
"learning_rate": 2.5635933367361144e-05,
"loss": 0.2618,
"step": 7860
},
{
"epoch": 1.0900277008310248,
"grad_norm": 3.764596939086914,
"learning_rate": 2.561737274372419e-05,
"loss": 0.3045,
"step": 7870
},
{
"epoch": 1.0914127423822715,
"grad_norm": 2.5493414402008057,
"learning_rate": 2.5598812120087235e-05,
"loss": 0.2534,
"step": 7880
},
{
"epoch": 1.0927977839335181,
"grad_norm": 2.782120704650879,
"learning_rate": 2.5580251496450285e-05,
"loss": 0.283,
"step": 7890
},
{
"epoch": 1.0941828254847645,
"grad_norm": 3.238389253616333,
"learning_rate": 2.5561690872813326e-05,
"loss": 0.3168,
"step": 7900
},
{
"epoch": 1.0955678670360112,
"grad_norm": 1.759173035621643,
"learning_rate": 2.5543130249176376e-05,
"loss": 0.2869,
"step": 7910
},
{
"epoch": 1.0969529085872576,
"grad_norm": 3.056239604949951,
"learning_rate": 2.552456962553942e-05,
"loss": 0.2956,
"step": 7920
},
{
"epoch": 1.0983379501385042,
"grad_norm": 2.110605478286743,
"learning_rate": 2.5506009001902467e-05,
"loss": 0.2692,
"step": 7930
},
{
"epoch": 1.0997229916897506,
"grad_norm": 2.7864487171173096,
"learning_rate": 2.548744837826551e-05,
"loss": 0.3191,
"step": 7940
},
{
"epoch": 1.1011080332409973,
"grad_norm": 4.533125400543213,
"learning_rate": 2.5468887754628558e-05,
"loss": 0.3144,
"step": 7950
},
{
"epoch": 1.1024930747922437,
"grad_norm": 4.58744478225708,
"learning_rate": 2.5450327130991602e-05,
"loss": 0.286,
"step": 7960
},
{
"epoch": 1.1038781163434903,
"grad_norm": 1.8122034072875977,
"learning_rate": 2.5431766507354646e-05,
"loss": 0.2732,
"step": 7970
},
{
"epoch": 1.1052631578947367,
"grad_norm": 2.5709145069122314,
"learning_rate": 2.5413205883717696e-05,
"loss": 0.2861,
"step": 7980
},
{
"epoch": 1.1066481994459834,
"grad_norm": 3.8308207988739014,
"learning_rate": 2.539464526008074e-05,
"loss": 0.2891,
"step": 7990
},
{
"epoch": 1.10803324099723,
"grad_norm": 3.6823465824127197,
"learning_rate": 2.5376084636443787e-05,
"loss": 0.2855,
"step": 8000
},
{
"epoch": 1.1094182825484764,
"grad_norm": 1.981024146080017,
"learning_rate": 2.535752401280683e-05,
"loss": 0.3265,
"step": 8010
},
{
"epoch": 1.110803324099723,
"grad_norm": 2.555572748184204,
"learning_rate": 2.533896338916988e-05,
"loss": 0.3057,
"step": 8020
},
{
"epoch": 1.1121883656509695,
"grad_norm": 2.066537380218506,
"learning_rate": 2.5320402765532922e-05,
"loss": 0.3195,
"step": 8030
},
{
"epoch": 1.113573407202216,
"grad_norm": 2.2335755825042725,
"learning_rate": 2.5301842141895973e-05,
"loss": 0.255,
"step": 8040
},
{
"epoch": 1.1149584487534625,
"grad_norm": 2.2410192489624023,
"learning_rate": 2.5283281518259017e-05,
"loss": 0.2554,
"step": 8050
},
{
"epoch": 1.1163434903047091,
"grad_norm": 3.2448580265045166,
"learning_rate": 2.5264720894622064e-05,
"loss": 0.2927,
"step": 8060
},
{
"epoch": 1.1177285318559558,
"grad_norm": 4.015766143798828,
"learning_rate": 2.5246160270985108e-05,
"loss": 0.3037,
"step": 8070
},
{
"epoch": 1.1191135734072022,
"grad_norm": 3.3745741844177246,
"learning_rate": 2.522759964734815e-05,
"loss": 0.3126,
"step": 8080
},
{
"epoch": 1.1204986149584488,
"grad_norm": 2.4334716796875,
"learning_rate": 2.52090390237112e-05,
"loss": 0.3142,
"step": 8090
},
{
"epoch": 1.1218836565096952,
"grad_norm": 2.36714243888855,
"learning_rate": 2.5190478400074242e-05,
"loss": 0.2496,
"step": 8100
},
{
"epoch": 1.1232686980609419,
"grad_norm": 1.657104730606079,
"learning_rate": 2.517191777643729e-05,
"loss": 0.3198,
"step": 8110
},
{
"epoch": 1.1246537396121883,
"grad_norm": 10.06810188293457,
"learning_rate": 2.5153357152800333e-05,
"loss": 0.3197,
"step": 8120
},
{
"epoch": 1.126038781163435,
"grad_norm": 3.1592838764190674,
"learning_rate": 2.5134796529163384e-05,
"loss": 0.3127,
"step": 8130
},
{
"epoch": 1.1274238227146816,
"grad_norm": 2.0332980155944824,
"learning_rate": 2.5116235905526428e-05,
"loss": 0.2899,
"step": 8140
},
{
"epoch": 1.128808864265928,
"grad_norm": 2.814194917678833,
"learning_rate": 2.5097675281889475e-05,
"loss": 0.3003,
"step": 8150
},
{
"epoch": 1.1301939058171746,
"grad_norm": 2.8768527507781982,
"learning_rate": 2.507911465825252e-05,
"loss": 0.318,
"step": 8160
},
{
"epoch": 1.131578947368421,
"grad_norm": 2.1469993591308594,
"learning_rate": 2.5060554034615566e-05,
"loss": 0.3028,
"step": 8170
},
{
"epoch": 1.1329639889196677,
"grad_norm": 1.9456562995910645,
"learning_rate": 2.504384947334231e-05,
"loss": 0.2919,
"step": 8180
},
{
"epoch": 1.134349030470914,
"grad_norm": 2.4310145378112793,
"learning_rate": 2.5025288849705353e-05,
"loss": 0.2333,
"step": 8190
},
{
"epoch": 1.1357340720221607,
"grad_norm": 3.5691587924957275,
"learning_rate": 2.50067282260684e-05,
"loss": 0.2492,
"step": 8200
},
{
"epoch": 1.1371191135734071,
"grad_norm": 2.5814597606658936,
"learning_rate": 2.4988167602431444e-05,
"loss": 0.2944,
"step": 8210
},
{
"epoch": 1.1385041551246537,
"grad_norm": 2.083385467529297,
"learning_rate": 2.496960697879449e-05,
"loss": 0.2617,
"step": 8220
},
{
"epoch": 1.1398891966759002,
"grad_norm": 3.2597713470458984,
"learning_rate": 2.4951046355157535e-05,
"loss": 0.2932,
"step": 8230
},
{
"epoch": 1.1412742382271468,
"grad_norm": 2.046088695526123,
"learning_rate": 2.493248573152058e-05,
"loss": 0.2742,
"step": 8240
},
{
"epoch": 1.1426592797783934,
"grad_norm": 2.5761780738830566,
"learning_rate": 2.491392510788363e-05,
"loss": 0.3342,
"step": 8250
},
{
"epoch": 1.1440443213296398,
"grad_norm": 2.987704277038574,
"learning_rate": 2.4895364484246673e-05,
"loss": 0.2775,
"step": 8260
},
{
"epoch": 1.1454293628808865,
"grad_norm": 1.9415042400360107,
"learning_rate": 2.487680386060972e-05,
"loss": 0.2547,
"step": 8270
},
{
"epoch": 1.146814404432133,
"grad_norm": 2.4116508960723877,
"learning_rate": 2.4858243236972764e-05,
"loss": 0.2978,
"step": 8280
},
{
"epoch": 1.1481994459833795,
"grad_norm": 4.54902458190918,
"learning_rate": 2.483968261333581e-05,
"loss": 0.3454,
"step": 8290
},
{
"epoch": 1.149584487534626,
"grad_norm": 1.8645448684692383,
"learning_rate": 2.4821121989698855e-05,
"loss": 0.2742,
"step": 8300
},
{
"epoch": 1.1509695290858726,
"grad_norm": 3.6969997882843018,
"learning_rate": 2.4802561366061902e-05,
"loss": 0.3105,
"step": 8310
},
{
"epoch": 1.1523545706371192,
"grad_norm": 4.505978107452393,
"learning_rate": 2.4784000742424946e-05,
"loss": 0.2696,
"step": 8320
},
{
"epoch": 1.1537396121883656,
"grad_norm": 3.0299696922302246,
"learning_rate": 2.476544011878799e-05,
"loss": 0.2728,
"step": 8330
},
{
"epoch": 1.1551246537396123,
"grad_norm": 22.56894874572754,
"learning_rate": 2.474687949515104e-05,
"loss": 0.3225,
"step": 8340
},
{
"epoch": 1.1565096952908587,
"grad_norm": 2.4208364486694336,
"learning_rate": 2.4728318871514084e-05,
"loss": 0.3178,
"step": 8350
},
{
"epoch": 1.1578947368421053,
"grad_norm": 2.0815048217773438,
"learning_rate": 2.470975824787713e-05,
"loss": 0.3118,
"step": 8360
},
{
"epoch": 1.1592797783933517,
"grad_norm": 2.7576022148132324,
"learning_rate": 2.4691197624240175e-05,
"loss": 0.3092,
"step": 8370
},
{
"epoch": 1.1606648199445984,
"grad_norm": 4.915147304534912,
"learning_rate": 2.4672637000603222e-05,
"loss": 0.3538,
"step": 8380
},
{
"epoch": 1.162049861495845,
"grad_norm": 3.7766640186309814,
"learning_rate": 2.4654076376966266e-05,
"loss": 0.2736,
"step": 8390
},
{
"epoch": 1.1634349030470914,
"grad_norm": 2.9332292079925537,
"learning_rate": 2.4635515753329317e-05,
"loss": 0.2785,
"step": 8400
},
{
"epoch": 1.164819944598338,
"grad_norm": 2.0695388317108154,
"learning_rate": 2.461695512969236e-05,
"loss": 0.2671,
"step": 8410
},
{
"epoch": 1.1662049861495845,
"grad_norm": 2.214833974838257,
"learning_rate": 2.4598394506055408e-05,
"loss": 0.2717,
"step": 8420
},
{
"epoch": 1.167590027700831,
"grad_norm": 2.94752836227417,
"learning_rate": 2.457983388241845e-05,
"loss": 0.277,
"step": 8430
},
{
"epoch": 1.1689750692520775,
"grad_norm": 2.258779764175415,
"learning_rate": 2.4561273258781495e-05,
"loss": 0.2858,
"step": 8440
},
{
"epoch": 1.1703601108033241,
"grad_norm": 3.7083449363708496,
"learning_rate": 2.4542712635144543e-05,
"loss": 0.2831,
"step": 8450
},
{
"epoch": 1.1717451523545706,
"grad_norm": 2.4679653644561768,
"learning_rate": 2.4524152011507586e-05,
"loss": 0.292,
"step": 8460
},
{
"epoch": 1.1731301939058172,
"grad_norm": 3.176544189453125,
"learning_rate": 2.4505591387870634e-05,
"loss": 0.3516,
"step": 8470
},
{
"epoch": 1.1745152354570636,
"grad_norm": 2.446927070617676,
"learning_rate": 2.4487030764233677e-05,
"loss": 0.2781,
"step": 8480
},
{
"epoch": 1.1759002770083102,
"grad_norm": 1.7947635650634766,
"learning_rate": 2.4468470140596728e-05,
"loss": 0.2913,
"step": 8490
},
{
"epoch": 1.1772853185595569,
"grad_norm": 2.4101195335388184,
"learning_rate": 2.4449909516959772e-05,
"loss": 0.3274,
"step": 8500
},
{
"epoch": 1.1786703601108033,
"grad_norm": 2.146688222885132,
"learning_rate": 2.443134889332282e-05,
"loss": 0.2721,
"step": 8510
},
{
"epoch": 1.18005540166205,
"grad_norm": 2.8058419227600098,
"learning_rate": 2.4412788269685863e-05,
"loss": 0.2937,
"step": 8520
},
{
"epoch": 1.1814404432132963,
"grad_norm": 7.315638065338135,
"learning_rate": 2.439422764604891e-05,
"loss": 0.2474,
"step": 8530
},
{
"epoch": 1.182825484764543,
"grad_norm": 3.7715487480163574,
"learning_rate": 2.4375667022411954e-05,
"loss": 0.308,
"step": 8540
},
{
"epoch": 1.1842105263157894,
"grad_norm": 2.942664861679077,
"learning_rate": 2.4357106398774998e-05,
"loss": 0.3335,
"step": 8550
},
{
"epoch": 1.185595567867036,
"grad_norm": 2.307800769805908,
"learning_rate": 2.4338545775138048e-05,
"loss": 0.2615,
"step": 8560
},
{
"epoch": 1.1869806094182827,
"grad_norm": 2.0695810317993164,
"learning_rate": 2.4319985151501092e-05,
"loss": 0.2943,
"step": 8570
},
{
"epoch": 1.188365650969529,
"grad_norm": 1.8298561573028564,
"learning_rate": 2.430142452786414e-05,
"loss": 0.3084,
"step": 8580
},
{
"epoch": 1.1897506925207757,
"grad_norm": 3.479372024536133,
"learning_rate": 2.4282863904227183e-05,
"loss": 0.3498,
"step": 8590
},
{
"epoch": 1.1911357340720221,
"grad_norm": 3.58359694480896,
"learning_rate": 2.426430328059023e-05,
"loss": 0.3179,
"step": 8600
},
{
"epoch": 1.1925207756232687,
"grad_norm": 2.570115327835083,
"learning_rate": 2.4245742656953274e-05,
"loss": 0.2642,
"step": 8610
},
{
"epoch": 1.1939058171745152,
"grad_norm": 2.4691929817199707,
"learning_rate": 2.4227182033316325e-05,
"loss": 0.2939,
"step": 8620
},
{
"epoch": 1.1952908587257618,
"grad_norm": 2.3441882133483887,
"learning_rate": 2.420862140967937e-05,
"loss": 0.2578,
"step": 8630
},
{
"epoch": 1.1966759002770084,
"grad_norm": 2.204817295074463,
"learning_rate": 2.4190060786042412e-05,
"loss": 0.2726,
"step": 8640
},
{
"epoch": 1.1980609418282548,
"grad_norm": 3.0930302143096924,
"learning_rate": 2.417150016240546e-05,
"loss": 0.2888,
"step": 8650
},
{
"epoch": 1.1994459833795015,
"grad_norm": 5.16555643081665,
"learning_rate": 2.4152939538768503e-05,
"loss": 0.3081,
"step": 8660
},
{
"epoch": 1.200831024930748,
"grad_norm": 2.1353518962860107,
"learning_rate": 2.413437891513155e-05,
"loss": 0.288,
"step": 8670
},
{
"epoch": 1.2022160664819945,
"grad_norm": 3.657902956008911,
"learning_rate": 2.4115818291494594e-05,
"loss": 0.366,
"step": 8680
},
{
"epoch": 1.203601108033241,
"grad_norm": 2.1335654258728027,
"learning_rate": 2.409725766785764e-05,
"loss": 0.3347,
"step": 8690
},
{
"epoch": 1.2049861495844876,
"grad_norm": 1.3963650465011597,
"learning_rate": 2.4078697044220685e-05,
"loss": 0.2674,
"step": 8700
},
{
"epoch": 1.206371191135734,
"grad_norm": 2.855625629425049,
"learning_rate": 2.4060136420583736e-05,
"loss": 0.2461,
"step": 8710
},
{
"epoch": 1.2077562326869806,
"grad_norm": 6.864447593688965,
"learning_rate": 2.404157579694678e-05,
"loss": 0.2974,
"step": 8720
},
{
"epoch": 1.209141274238227,
"grad_norm": 1.8673814535140991,
"learning_rate": 2.4023015173309827e-05,
"loss": 0.3228,
"step": 8730
},
{
"epoch": 1.2105263157894737,
"grad_norm": 2.104527235031128,
"learning_rate": 2.400445454967287e-05,
"loss": 0.2705,
"step": 8740
},
{
"epoch": 1.2119113573407203,
"grad_norm": 3.6864442825317383,
"learning_rate": 2.3985893926035915e-05,
"loss": 0.2893,
"step": 8750
},
{
"epoch": 1.2132963988919667,
"grad_norm": 2.519566774368286,
"learning_rate": 2.3967333302398962e-05,
"loss": 0.2942,
"step": 8760
},
{
"epoch": 1.2146814404432134,
"grad_norm": 2.556389093399048,
"learning_rate": 2.3948772678762006e-05,
"loss": 0.2899,
"step": 8770
},
{
"epoch": 1.2160664819944598,
"grad_norm": 3.0239813327789307,
"learning_rate": 2.3930212055125056e-05,
"loss": 0.2676,
"step": 8780
},
{
"epoch": 1.2174515235457064,
"grad_norm": 1.96883225440979,
"learning_rate": 2.39116514314881e-05,
"loss": 0.272,
"step": 8790
},
{
"epoch": 1.2188365650969528,
"grad_norm": 2.565373659133911,
"learning_rate": 2.3893090807851147e-05,
"loss": 0.2608,
"step": 8800
},
{
"epoch": 1.2202216066481995,
"grad_norm": 2.15969181060791,
"learning_rate": 2.387453018421419e-05,
"loss": 0.3213,
"step": 8810
},
{
"epoch": 1.221606648199446,
"grad_norm": 2.249484062194824,
"learning_rate": 2.3855969560577238e-05,
"loss": 0.2686,
"step": 8820
},
{
"epoch": 1.2229916897506925,
"grad_norm": 2.9420814514160156,
"learning_rate": 2.3837408936940282e-05,
"loss": 0.2769,
"step": 8830
},
{
"epoch": 1.2243767313019391,
"grad_norm": 3.2192189693450928,
"learning_rate": 2.3818848313303333e-05,
"loss": 0.3125,
"step": 8840
},
{
"epoch": 1.2257617728531855,
"grad_norm": 2.442768096923828,
"learning_rate": 2.3800287689666376e-05,
"loss": 0.283,
"step": 8850
},
{
"epoch": 1.2271468144044322,
"grad_norm": 1.6071429252624512,
"learning_rate": 2.378172706602942e-05,
"loss": 0.3011,
"step": 8860
},
{
"epoch": 1.2285318559556786,
"grad_norm": 3.044898748397827,
"learning_rate": 2.3763166442392467e-05,
"loss": 0.2661,
"step": 8870
},
{
"epoch": 1.2299168975069252,
"grad_norm": 2.5329463481903076,
"learning_rate": 2.374460581875551e-05,
"loss": 0.276,
"step": 8880
},
{
"epoch": 1.2313019390581719,
"grad_norm": 2.2597010135650635,
"learning_rate": 2.372604519511856e-05,
"loss": 0.2875,
"step": 8890
},
{
"epoch": 1.2326869806094183,
"grad_norm": 2.011082649230957,
"learning_rate": 2.3707484571481602e-05,
"loss": 0.3035,
"step": 8900
},
{
"epoch": 1.234072022160665,
"grad_norm": 2.701603412628174,
"learning_rate": 2.368892394784465e-05,
"loss": 0.2657,
"step": 8910
},
{
"epoch": 1.2354570637119113,
"grad_norm": 2.4791486263275146,
"learning_rate": 2.3670363324207693e-05,
"loss": 0.3118,
"step": 8920
},
{
"epoch": 1.236842105263158,
"grad_norm": 3.129568338394165,
"learning_rate": 2.3651802700570744e-05,
"loss": 0.3063,
"step": 8930
},
{
"epoch": 1.2382271468144044,
"grad_norm": 1.9781174659729004,
"learning_rate": 2.3633242076933788e-05,
"loss": 0.2864,
"step": 8940
},
{
"epoch": 1.239612188365651,
"grad_norm": 1.7614623308181763,
"learning_rate": 2.3614681453296835e-05,
"loss": 0.288,
"step": 8950
},
{
"epoch": 1.2409972299168974,
"grad_norm": 1.9567925930023193,
"learning_rate": 2.359612082965988e-05,
"loss": 0.2872,
"step": 8960
},
{
"epoch": 1.242382271468144,
"grad_norm": 2.3994760513305664,
"learning_rate": 2.3577560206022922e-05,
"loss": 0.3582,
"step": 8970
},
{
"epoch": 1.2437673130193905,
"grad_norm": 1.8160407543182373,
"learning_rate": 2.355899958238597e-05,
"loss": 0.2995,
"step": 8980
},
{
"epoch": 1.245152354570637,
"grad_norm": 2.4072775840759277,
"learning_rate": 2.3540438958749013e-05,
"loss": 0.273,
"step": 8990
},
{
"epoch": 1.2465373961218837,
"grad_norm": 2.576134204864502,
"learning_rate": 2.3521878335112064e-05,
"loss": 0.2805,
"step": 9000
},
{
"epoch": 1.2469529085872577,
"eval_loss": 0.32686564326286316,
"eval_runtime": 1418.8287,
"eval_samples_per_second": 6.428,
"eval_steps_per_second": 0.803,
"step": 9003
},
{
"epoch": 1.2479224376731302,
"grad_norm": 1.4296165704727173,
"learning_rate": 2.3503317711475108e-05,
"loss": 0.2445,
"step": 9010
},
{
"epoch": 1.2493074792243768,
"grad_norm": 2.6902916431427,
"learning_rate": 2.3484757087838155e-05,
"loss": 0.288,
"step": 9020
},
{
"epoch": 1.2506925207756232,
"grad_norm": 1.775841474533081,
"learning_rate": 2.34661964642012e-05,
"loss": 0.2803,
"step": 9030
},
{
"epoch": 1.2520775623268698,
"grad_norm": 3.9191651344299316,
"learning_rate": 2.3447635840564246e-05,
"loss": 0.3268,
"step": 9040
},
{
"epoch": 1.2534626038781163,
"grad_norm": 2.516214370727539,
"learning_rate": 2.342907521692729e-05,
"loss": 0.27,
"step": 9050
},
{
"epoch": 1.254847645429363,
"grad_norm": 2.2284133434295654,
"learning_rate": 2.3410514593290334e-05,
"loss": 0.2835,
"step": 9060
},
{
"epoch": 1.2562326869806095,
"grad_norm": 1.9421190023422241,
"learning_rate": 2.3391953969653384e-05,
"loss": 0.2742,
"step": 9070
},
{
"epoch": 1.257617728531856,
"grad_norm": 3.4131953716278076,
"learning_rate": 2.3373393346016425e-05,
"loss": 0.3056,
"step": 9080
},
{
"epoch": 1.2590027700831026,
"grad_norm": 2.111435890197754,
"learning_rate": 2.3354832722379475e-05,
"loss": 0.2678,
"step": 9090
},
{
"epoch": 1.260387811634349,
"grad_norm": 2.7209439277648926,
"learning_rate": 2.333627209874252e-05,
"loss": 0.2866,
"step": 9100
},
{
"epoch": 1.2617728531855956,
"grad_norm": 2.8803234100341797,
"learning_rate": 2.3317711475105566e-05,
"loss": 0.2763,
"step": 9110
},
{
"epoch": 1.263157894736842,
"grad_norm": 1.6891577243804932,
"learning_rate": 2.329915085146861e-05,
"loss": 0.2875,
"step": 9120
},
{
"epoch": 1.2645429362880887,
"grad_norm": 3.695586919784546,
"learning_rate": 2.3280590227831657e-05,
"loss": 0.2863,
"step": 9130
},
{
"epoch": 1.2659279778393353,
"grad_norm": 2.572779417037964,
"learning_rate": 2.32620296041947e-05,
"loss": 0.2841,
"step": 9140
},
{
"epoch": 1.2673130193905817,
"grad_norm": 2.819389581680298,
"learning_rate": 2.3243468980557752e-05,
"loss": 0.2626,
"step": 9150
},
{
"epoch": 1.2686980609418281,
"grad_norm": 1.8481063842773438,
"learning_rate": 2.3224908356920796e-05,
"loss": 0.2853,
"step": 9160
},
{
"epoch": 1.2700831024930748,
"grad_norm": 3.2458460330963135,
"learning_rate": 2.320634773328384e-05,
"loss": 0.286,
"step": 9170
},
{
"epoch": 1.2714681440443214,
"grad_norm": 7.590353488922119,
"learning_rate": 2.3187787109646887e-05,
"loss": 0.2914,
"step": 9180
},
{
"epoch": 1.2728531855955678,
"grad_norm": 2.576279401779175,
"learning_rate": 2.316922648600993e-05,
"loss": 0.2798,
"step": 9190
},
{
"epoch": 1.2742382271468145,
"grad_norm": 2.004206657409668,
"learning_rate": 2.3150665862372978e-05,
"loss": 0.2607,
"step": 9200
},
{
"epoch": 1.2756232686980609,
"grad_norm": 1.7435073852539062,
"learning_rate": 2.313210523873602e-05,
"loss": 0.297,
"step": 9210
},
{
"epoch": 1.2770083102493075,
"grad_norm": 2.3837873935699463,
"learning_rate": 2.3113544615099072e-05,
"loss": 0.2865,
"step": 9220
},
{
"epoch": 1.278393351800554,
"grad_norm": 2.763490676879883,
"learning_rate": 2.3094983991462116e-05,
"loss": 0.2951,
"step": 9230
},
{
"epoch": 1.2797783933518005,
"grad_norm": 2.5408225059509277,
"learning_rate": 2.3076423367825163e-05,
"loss": 0.2737,
"step": 9240
},
{
"epoch": 1.2811634349030472,
"grad_norm": 2.692471742630005,
"learning_rate": 2.3057862744188207e-05,
"loss": 0.2757,
"step": 9250
},
{
"epoch": 1.2825484764542936,
"grad_norm": 2.177374839782715,
"learning_rate": 2.3039302120551254e-05,
"loss": 0.2898,
"step": 9260
},
{
"epoch": 1.2839335180055402,
"grad_norm": 2.7410569190979004,
"learning_rate": 2.3020741496914298e-05,
"loss": 0.302,
"step": 9270
},
{
"epoch": 1.2853185595567866,
"grad_norm": 2.6148903369903564,
"learning_rate": 2.300218087327734e-05,
"loss": 0.3334,
"step": 9280
},
{
"epoch": 1.2867036011080333,
"grad_norm": 1.9452197551727295,
"learning_rate": 2.298362024964039e-05,
"loss": 0.2833,
"step": 9290
},
{
"epoch": 1.2880886426592797,
"grad_norm": 2.417820453643799,
"learning_rate": 2.2965059626003433e-05,
"loss": 0.2605,
"step": 9300
},
{
"epoch": 1.2894736842105263,
"grad_norm": 3.1312954425811768,
"learning_rate": 2.2946499002366483e-05,
"loss": 0.2887,
"step": 9310
},
{
"epoch": 1.290858725761773,
"grad_norm": 3.3138206005096436,
"learning_rate": 2.2927938378729527e-05,
"loss": 0.2811,
"step": 9320
},
{
"epoch": 1.2922437673130194,
"grad_norm": 2.579641342163086,
"learning_rate": 2.2909377755092574e-05,
"loss": 0.2698,
"step": 9330
},
{
"epoch": 1.293628808864266,
"grad_norm": 3.3723838329315186,
"learning_rate": 2.2890817131455618e-05,
"loss": 0.2913,
"step": 9340
},
{
"epoch": 1.2950138504155124,
"grad_norm": 3.1993815898895264,
"learning_rate": 2.2872256507818665e-05,
"loss": 0.2779,
"step": 9350
},
{
"epoch": 1.296398891966759,
"grad_norm": 3.524803876876831,
"learning_rate": 2.285369588418171e-05,
"loss": 0.3189,
"step": 9360
},
{
"epoch": 1.2977839335180055,
"grad_norm": 2.997694492340088,
"learning_rate": 2.2835135260544753e-05,
"loss": 0.2694,
"step": 9370
},
{
"epoch": 1.299168975069252,
"grad_norm": 2.596389055252075,
"learning_rate": 2.2816574636907803e-05,
"loss": 0.3072,
"step": 9380
},
{
"epoch": 1.3005540166204987,
"grad_norm": 1.9428213834762573,
"learning_rate": 2.2798014013270847e-05,
"loss": 0.278,
"step": 9390
},
{
"epoch": 1.3019390581717452,
"grad_norm": 1.9896929264068604,
"learning_rate": 2.2779453389633894e-05,
"loss": 0.2886,
"step": 9400
},
{
"epoch": 1.3033240997229916,
"grad_norm": 1.7544904947280884,
"learning_rate": 2.2760892765996938e-05,
"loss": 0.286,
"step": 9410
},
{
"epoch": 1.3047091412742382,
"grad_norm": 3.5407917499542236,
"learning_rate": 2.2742332142359985e-05,
"loss": 0.2905,
"step": 9420
},
{
"epoch": 1.3060941828254848,
"grad_norm": 2.418882131576538,
"learning_rate": 2.272377151872303e-05,
"loss": 0.2943,
"step": 9430
},
{
"epoch": 1.3074792243767313,
"grad_norm": 2.2245256900787354,
"learning_rate": 2.270521089508608e-05,
"loss": 0.2887,
"step": 9440
},
{
"epoch": 1.3088642659279779,
"grad_norm": 2.4972410202026367,
"learning_rate": 2.2686650271449124e-05,
"loss": 0.294,
"step": 9450
},
{
"epoch": 1.3102493074792243,
"grad_norm": 2.2846741676330566,
"learning_rate": 2.266808964781217e-05,
"loss": 0.265,
"step": 9460
},
{
"epoch": 1.311634349030471,
"grad_norm": 1.917407751083374,
"learning_rate": 2.2649529024175215e-05,
"loss": 0.2872,
"step": 9470
},
{
"epoch": 1.3130193905817173,
"grad_norm": 2.495126247406006,
"learning_rate": 2.263096840053826e-05,
"loss": 0.2577,
"step": 9480
},
{
"epoch": 1.314404432132964,
"grad_norm": 2.143667697906494,
"learning_rate": 2.2612407776901306e-05,
"loss": 0.2766,
"step": 9490
},
{
"epoch": 1.3157894736842106,
"grad_norm": 3.016167402267456,
"learning_rate": 2.259384715326435e-05,
"loss": 0.2724,
"step": 9500
},
{
"epoch": 1.317174515235457,
"grad_norm": 2.1139116287231445,
"learning_rate": 2.2575286529627397e-05,
"loss": 0.2746,
"step": 9510
},
{
"epoch": 1.3185595567867037,
"grad_norm": 1.5689263343811035,
"learning_rate": 2.255672590599044e-05,
"loss": 0.2304,
"step": 9520
},
{
"epoch": 1.31994459833795,
"grad_norm": 4.322932243347168,
"learning_rate": 2.253816528235349e-05,
"loss": 0.2583,
"step": 9530
},
{
"epoch": 1.3213296398891967,
"grad_norm": 2.0311527252197266,
"learning_rate": 2.2519604658716535e-05,
"loss": 0.2542,
"step": 9540
},
{
"epoch": 1.3227146814404431,
"grad_norm": 2.6098263263702393,
"learning_rate": 2.2501044035079582e-05,
"loss": 0.2703,
"step": 9550
},
{
"epoch": 1.3240997229916898,
"grad_norm": 1.897586464881897,
"learning_rate": 2.2482483411442626e-05,
"loss": 0.282,
"step": 9560
},
{
"epoch": 1.3254847645429364,
"grad_norm": 2.359265089035034,
"learning_rate": 2.2463922787805673e-05,
"loss": 0.2873,
"step": 9570
},
{
"epoch": 1.3268698060941828,
"grad_norm": 2.188415288925171,
"learning_rate": 2.2445362164168717e-05,
"loss": 0.2575,
"step": 9580
},
{
"epoch": 1.3282548476454292,
"grad_norm": 3.153799533843994,
"learning_rate": 2.242680154053176e-05,
"loss": 0.2411,
"step": 9590
},
{
"epoch": 1.3296398891966759,
"grad_norm": 1.8487639427185059,
"learning_rate": 2.240824091689481e-05,
"loss": 0.2386,
"step": 9600
},
{
"epoch": 1.3310249307479225,
"grad_norm": 2.406550407409668,
"learning_rate": 2.2389680293257855e-05,
"loss": 0.2674,
"step": 9610
},
{
"epoch": 1.332409972299169,
"grad_norm": 2.741635322570801,
"learning_rate": 2.2371119669620902e-05,
"loss": 0.2715,
"step": 9620
},
{
"epoch": 1.3337950138504155,
"grad_norm": 2.4122025966644287,
"learning_rate": 2.2352559045983946e-05,
"loss": 0.3105,
"step": 9630
},
{
"epoch": 1.3351800554016622,
"grad_norm": 2.6568686962127686,
"learning_rate": 2.2333998422346993e-05,
"loss": 0.3032,
"step": 9640
},
{
"epoch": 1.3365650969529086,
"grad_norm": 2.265321731567383,
"learning_rate": 2.2315437798710037e-05,
"loss": 0.2754,
"step": 9650
},
{
"epoch": 1.337950138504155,
"grad_norm": 3.134011745452881,
"learning_rate": 2.2296877175073088e-05,
"loss": 0.2874,
"step": 9660
},
{
"epoch": 1.3393351800554016,
"grad_norm": 2.3763790130615234,
"learning_rate": 2.227831655143613e-05,
"loss": 0.2227,
"step": 9670
},
{
"epoch": 1.3407202216066483,
"grad_norm": 1.92081880569458,
"learning_rate": 2.2259755927799175e-05,
"loss": 0.2886,
"step": 9680
},
{
"epoch": 1.3421052631578947,
"grad_norm": 3.245089054107666,
"learning_rate": 2.2241195304162223e-05,
"loss": 0.2795,
"step": 9690
},
{
"epoch": 1.3434903047091413,
"grad_norm": 2.542462110519409,
"learning_rate": 2.2222634680525266e-05,
"loss": 0.2814,
"step": 9700
},
{
"epoch": 1.3448753462603877,
"grad_norm": 2.3733303546905518,
"learning_rate": 2.2204074056888314e-05,
"loss": 0.2629,
"step": 9710
},
{
"epoch": 1.3462603878116344,
"grad_norm": 3.7426581382751465,
"learning_rate": 2.2185513433251357e-05,
"loss": 0.334,
"step": 9720
},
{
"epoch": 1.3476454293628808,
"grad_norm": 3.181501626968384,
"learning_rate": 2.2166952809614405e-05,
"loss": 0.2797,
"step": 9730
},
{
"epoch": 1.3490304709141274,
"grad_norm": 2.654956817626953,
"learning_rate": 2.214839218597745e-05,
"loss": 0.3192,
"step": 9740
},
{
"epoch": 1.350415512465374,
"grad_norm": 1.8692598342895508,
"learning_rate": 2.21298315623405e-05,
"loss": 0.2282,
"step": 9750
},
{
"epoch": 1.3518005540166205,
"grad_norm": 2.027109384536743,
"learning_rate": 2.2111270938703543e-05,
"loss": 0.3065,
"step": 9760
},
{
"epoch": 1.353185595567867,
"grad_norm": 1.7037060260772705,
"learning_rate": 2.209271031506659e-05,
"loss": 0.2624,
"step": 9770
},
{
"epoch": 1.3545706371191135,
"grad_norm": 2.3476176261901855,
"learning_rate": 2.2074149691429634e-05,
"loss": 0.2572,
"step": 9780
},
{
"epoch": 1.3559556786703602,
"grad_norm": 3.0191843509674072,
"learning_rate": 2.2055589067792678e-05,
"loss": 0.275,
"step": 9790
},
{
"epoch": 1.3573407202216066,
"grad_norm": 2.4362123012542725,
"learning_rate": 2.2037028444155725e-05,
"loss": 0.2981,
"step": 9800
},
{
"epoch": 1.3587257617728532,
"grad_norm": 1.9792042970657349,
"learning_rate": 2.201846782051877e-05,
"loss": 0.2992,
"step": 9810
},
{
"epoch": 1.3601108033240998,
"grad_norm": 3.2633731365203857,
"learning_rate": 2.199990719688182e-05,
"loss": 0.329,
"step": 9820
},
{
"epoch": 1.3614958448753463,
"grad_norm": 2.1251308917999268,
"learning_rate": 2.1981346573244863e-05,
"loss": 0.2727,
"step": 9830
},
{
"epoch": 1.3628808864265927,
"grad_norm": 1.8353410959243774,
"learning_rate": 2.196278594960791e-05,
"loss": 0.2765,
"step": 9840
},
{
"epoch": 1.3642659279778393,
"grad_norm": 3.0422475337982178,
"learning_rate": 2.1944225325970954e-05,
"loss": 0.2846,
"step": 9850
},
{
"epoch": 1.365650969529086,
"grad_norm": 2.062798023223877,
"learning_rate": 2.1925664702334e-05,
"loss": 0.2542,
"step": 9860
},
{
"epoch": 1.3670360110803323,
"grad_norm": 1.8410342931747437,
"learning_rate": 2.1907104078697045e-05,
"loss": 0.2511,
"step": 9870
},
{
"epoch": 1.368421052631579,
"grad_norm": 2.4124655723571777,
"learning_rate": 2.1888543455060092e-05,
"loss": 0.3129,
"step": 9880
},
{
"epoch": 1.3698060941828256,
"grad_norm": 2.5593621730804443,
"learning_rate": 2.1869982831423136e-05,
"loss": 0.2813,
"step": 9890
},
{
"epoch": 1.371191135734072,
"grad_norm": 1.6741851568222046,
"learning_rate": 2.185142220778618e-05,
"loss": 0.3063,
"step": 9900
},
{
"epoch": 1.3725761772853184,
"grad_norm": 4.773355007171631,
"learning_rate": 2.183286158414923e-05,
"loss": 0.2506,
"step": 9910
},
{
"epoch": 1.373961218836565,
"grad_norm": 2.980382204055786,
"learning_rate": 2.1814300960512274e-05,
"loss": 0.3116,
"step": 9920
},
{
"epoch": 1.3753462603878117,
"grad_norm": 2.324427366256714,
"learning_rate": 2.179574033687532e-05,
"loss": 0.2479,
"step": 9930
},
{
"epoch": 1.3767313019390581,
"grad_norm": 2.517643451690674,
"learning_rate": 2.1777179713238365e-05,
"loss": 0.2692,
"step": 9940
},
{
"epoch": 1.3781163434903048,
"grad_norm": 2.677506923675537,
"learning_rate": 2.1758619089601412e-05,
"loss": 0.2807,
"step": 9950
},
{
"epoch": 1.3795013850415512,
"grad_norm": 2.6422669887542725,
"learning_rate": 2.1740058465964456e-05,
"loss": 0.2816,
"step": 9960
},
{
"epoch": 1.3808864265927978,
"grad_norm": 2.5744709968566895,
"learning_rate": 2.1721497842327507e-05,
"loss": 0.2577,
"step": 9970
},
{
"epoch": 1.3822714681440442,
"grad_norm": 1.9916622638702393,
"learning_rate": 2.170293721869055e-05,
"loss": 0.2952,
"step": 9980
},
{
"epoch": 1.3836565096952909,
"grad_norm": 3.8083372116088867,
"learning_rate": 2.1684376595053595e-05,
"loss": 0.3098,
"step": 9990
},
{
"epoch": 1.3850415512465375,
"grad_norm": 2.3398289680480957,
"learning_rate": 2.1665815971416642e-05,
"loss": 0.2562,
"step": 10000
},
{
"epoch": 1.386426592797784,
"grad_norm": 2.8088276386260986,
"learning_rate": 2.1647255347779686e-05,
"loss": 0.29,
"step": 10010
},
{
"epoch": 1.3878116343490305,
"grad_norm": 2.151501178741455,
"learning_rate": 2.1628694724142733e-05,
"loss": 0.3037,
"step": 10020
},
{
"epoch": 1.389196675900277,
"grad_norm": 1.9896818399429321,
"learning_rate": 2.1610134100505777e-05,
"loss": 0.296,
"step": 10030
},
{
"epoch": 1.3905817174515236,
"grad_norm": 2.18941330909729,
"learning_rate": 2.1591573476868827e-05,
"loss": 0.275,
"step": 10040
},
{
"epoch": 1.39196675900277,
"grad_norm": 2.6739301681518555,
"learning_rate": 2.157301285323187e-05,
"loss": 0.2596,
"step": 10050
},
{
"epoch": 1.3933518005540166,
"grad_norm": 2.3680918216705322,
"learning_rate": 2.1554452229594918e-05,
"loss": 0.3076,
"step": 10060
},
{
"epoch": 1.3947368421052633,
"grad_norm": 2.2901291847229004,
"learning_rate": 2.1535891605957962e-05,
"loss": 0.2879,
"step": 10070
},
{
"epoch": 1.3961218836565097,
"grad_norm": 2.8573126792907715,
"learning_rate": 2.151733098232101e-05,
"loss": 0.2727,
"step": 10080
},
{
"epoch": 1.397506925207756,
"grad_norm": 2.1237952709198,
"learning_rate": 2.1498770358684053e-05,
"loss": 0.2954,
"step": 10090
},
{
"epoch": 1.3988919667590027,
"grad_norm": 2.6920433044433594,
"learning_rate": 2.1480209735047097e-05,
"loss": 0.2848,
"step": 10100
},
{
"epoch": 1.4002770083102494,
"grad_norm": 2.2819883823394775,
"learning_rate": 2.1461649111410144e-05,
"loss": 0.2435,
"step": 10110
},
{
"epoch": 1.4016620498614958,
"grad_norm": 1.8151028156280518,
"learning_rate": 2.1443088487773188e-05,
"loss": 0.255,
"step": 10120
},
{
"epoch": 1.4030470914127424,
"grad_norm": 3.254143714904785,
"learning_rate": 2.142452786413624e-05,
"loss": 0.2825,
"step": 10130
},
{
"epoch": 1.404432132963989,
"grad_norm": 2.8257339000701904,
"learning_rate": 2.1405967240499282e-05,
"loss": 0.2738,
"step": 10140
},
{
"epoch": 1.4058171745152355,
"grad_norm": 2.7441883087158203,
"learning_rate": 2.138740661686233e-05,
"loss": 0.2641,
"step": 10150
},
{
"epoch": 1.4072022160664819,
"grad_norm": 2.6015915870666504,
"learning_rate": 2.1368845993225373e-05,
"loss": 0.2996,
"step": 10160
},
{
"epoch": 1.4085872576177285,
"grad_norm": 2.5906431674957275,
"learning_rate": 2.1352141431952113e-05,
"loss": 0.2535,
"step": 10170
},
{
"epoch": 1.4099722991689752,
"grad_norm": 2.5820693969726562,
"learning_rate": 2.1333580808315163e-05,
"loss": 0.2971,
"step": 10180
},
{
"epoch": 1.4113573407202216,
"grad_norm": 1.439150094985962,
"learning_rate": 2.1315020184678207e-05,
"loss": 0.3055,
"step": 10190
},
{
"epoch": 1.4127423822714682,
"grad_norm": 2.544900894165039,
"learning_rate": 2.1296459561041254e-05,
"loss": 0.2866,
"step": 10200
},
{
"epoch": 1.4141274238227146,
"grad_norm": 3.0438737869262695,
"learning_rate": 2.1277898937404298e-05,
"loss": 0.2676,
"step": 10210
},
{
"epoch": 1.4155124653739612,
"grad_norm": 1.7316502332687378,
"learning_rate": 2.1259338313767345e-05,
"loss": 0.261,
"step": 10220
},
{
"epoch": 1.4168975069252077,
"grad_norm": 2.2577292919158936,
"learning_rate": 2.124077769013039e-05,
"loss": 0.2667,
"step": 10230
},
{
"epoch": 1.4182825484764543,
"grad_norm": 1.731268048286438,
"learning_rate": 2.122221706649344e-05,
"loss": 0.3069,
"step": 10240
},
{
"epoch": 1.419667590027701,
"grad_norm": 3.460369825363159,
"learning_rate": 2.120365644285648e-05,
"loss": 0.2672,
"step": 10250
},
{
"epoch": 1.4210526315789473,
"grad_norm": 1.6917779445648193,
"learning_rate": 2.1185095819219524e-05,
"loss": 0.2759,
"step": 10260
},
{
"epoch": 1.422437673130194,
"grad_norm": 2.418710708618164,
"learning_rate": 2.1166535195582574e-05,
"loss": 0.3133,
"step": 10270
},
{
"epoch": 1.4238227146814404,
"grad_norm": 2.533979892730713,
"learning_rate": 2.1147974571945618e-05,
"loss": 0.2795,
"step": 10280
},
{
"epoch": 1.425207756232687,
"grad_norm": 3.7644662857055664,
"learning_rate": 2.1129413948308665e-05,
"loss": 0.3077,
"step": 10290
},
{
"epoch": 1.4265927977839334,
"grad_norm": 2.7289891242980957,
"learning_rate": 2.111085332467171e-05,
"loss": 0.2475,
"step": 10300
},
{
"epoch": 1.42797783933518,
"grad_norm": 2.3104724884033203,
"learning_rate": 2.1092292701034756e-05,
"loss": 0.2295,
"step": 10310
},
{
"epoch": 1.4293628808864267,
"grad_norm": 1.9208799600601196,
"learning_rate": 2.10737320773978e-05,
"loss": 0.2759,
"step": 10320
},
{
"epoch": 1.4307479224376731,
"grad_norm": 2.4903857707977295,
"learning_rate": 2.105517145376085e-05,
"loss": 0.2779,
"step": 10330
},
{
"epoch": 1.4321329639889195,
"grad_norm": 2.3281075954437256,
"learning_rate": 2.1036610830123895e-05,
"loss": 0.2968,
"step": 10340
},
{
"epoch": 1.4335180055401662,
"grad_norm": 2.194763660430908,
"learning_rate": 2.1018050206486942e-05,
"loss": 0.2857,
"step": 10350
},
{
"epoch": 1.4349030470914128,
"grad_norm": 3.951835870742798,
"learning_rate": 2.0999489582849986e-05,
"loss": 0.2627,
"step": 10360
},
{
"epoch": 1.4362880886426592,
"grad_norm": 4.016772747039795,
"learning_rate": 2.098092895921303e-05,
"loss": 0.2799,
"step": 10370
},
{
"epoch": 1.4376731301939059,
"grad_norm": 2.1924023628234863,
"learning_rate": 2.0962368335576077e-05,
"loss": 0.311,
"step": 10380
},
{
"epoch": 1.4390581717451525,
"grad_norm": 2.053593635559082,
"learning_rate": 2.094380771193912e-05,
"loss": 0.2654,
"step": 10390
},
{
"epoch": 1.440443213296399,
"grad_norm": 1.9551750421524048,
"learning_rate": 2.092524708830217e-05,
"loss": 0.2732,
"step": 10400
},
{
"epoch": 1.4418282548476453,
"grad_norm": 2.2008869647979736,
"learning_rate": 2.0906686464665215e-05,
"loss": 0.2788,
"step": 10410
},
{
"epoch": 1.443213296398892,
"grad_norm": 2.2866125106811523,
"learning_rate": 2.0888125841028262e-05,
"loss": 0.2468,
"step": 10420
},
{
"epoch": 1.4445983379501386,
"grad_norm": 2.323117733001709,
"learning_rate": 2.0869565217391306e-05,
"loss": 0.2816,
"step": 10430
},
{
"epoch": 1.445983379501385,
"grad_norm": 2.459951639175415,
"learning_rate": 2.0851004593754353e-05,
"loss": 0.2886,
"step": 10440
},
{
"epoch": 1.4473684210526316,
"grad_norm": 2.0339560508728027,
"learning_rate": 2.0832443970117397e-05,
"loss": 0.259,
"step": 10450
},
{
"epoch": 1.448753462603878,
"grad_norm": 2.2099788188934326,
"learning_rate": 2.081388334648044e-05,
"loss": 0.2358,
"step": 10460
},
{
"epoch": 1.4501385041551247,
"grad_norm": 2.0454177856445312,
"learning_rate": 2.0795322722843488e-05,
"loss": 0.2324,
"step": 10470
},
{
"epoch": 1.451523545706371,
"grad_norm": 1.9629887342453003,
"learning_rate": 2.0776762099206532e-05,
"loss": 0.2338,
"step": 10480
},
{
"epoch": 1.4529085872576177,
"grad_norm": 2.8075554370880127,
"learning_rate": 2.0758201475569582e-05,
"loss": 0.2898,
"step": 10490
},
{
"epoch": 1.4542936288088644,
"grad_norm": 1.5575898885726929,
"learning_rate": 2.0739640851932626e-05,
"loss": 0.2811,
"step": 10500
},
{
"epoch": 1.4556786703601108,
"grad_norm": 1.6579359769821167,
"learning_rate": 2.0721080228295673e-05,
"loss": 0.2391,
"step": 10510
},
{
"epoch": 1.4570637119113574,
"grad_norm": 2.505563259124756,
"learning_rate": 2.0702519604658717e-05,
"loss": 0.2564,
"step": 10520
},
{
"epoch": 1.4584487534626038,
"grad_norm": 2.028754949569702,
"learning_rate": 2.0683958981021764e-05,
"loss": 0.2855,
"step": 10530
},
{
"epoch": 1.4598337950138505,
"grad_norm": 2.3444249629974365,
"learning_rate": 2.0665398357384808e-05,
"loss": 0.2575,
"step": 10540
},
{
"epoch": 1.4612188365650969,
"grad_norm": 1.9929412603378296,
"learning_rate": 2.064683773374786e-05,
"loss": 0.2481,
"step": 10550
},
{
"epoch": 1.4626038781163435,
"grad_norm": 4.5691094398498535,
"learning_rate": 2.0628277110110903e-05,
"loss": 0.2479,
"step": 10560
},
{
"epoch": 1.4639889196675901,
"grad_norm": 3.3610942363739014,
"learning_rate": 2.0609716486473946e-05,
"loss": 0.3265,
"step": 10570
},
{
"epoch": 1.4653739612188366,
"grad_norm": 2.8776373863220215,
"learning_rate": 2.0591155862836994e-05,
"loss": 0.2893,
"step": 10580
},
{
"epoch": 1.466759002770083,
"grad_norm": 3.1740386486053467,
"learning_rate": 2.0572595239200037e-05,
"loss": 0.2574,
"step": 10590
},
{
"epoch": 1.4681440443213296,
"grad_norm": 2.4539008140563965,
"learning_rate": 2.0554034615563085e-05,
"loss": 0.3127,
"step": 10600
},
{
"epoch": 1.4695290858725762,
"grad_norm": 1.850730538368225,
"learning_rate": 2.053547399192613e-05,
"loss": 0.2471,
"step": 10610
},
{
"epoch": 1.4709141274238227,
"grad_norm": 6.172549724578857,
"learning_rate": 2.051691336828918e-05,
"loss": 0.2555,
"step": 10620
},
{
"epoch": 1.4722991689750693,
"grad_norm": 1.7516671419143677,
"learning_rate": 2.0498352744652223e-05,
"loss": 0.225,
"step": 10630
},
{
"epoch": 1.4736842105263157,
"grad_norm": 3.782275915145874,
"learning_rate": 2.047979212101527e-05,
"loss": 0.2735,
"step": 10640
},
{
"epoch": 1.4750692520775623,
"grad_norm": 4.208226680755615,
"learning_rate": 2.0461231497378314e-05,
"loss": 0.2755,
"step": 10650
},
{
"epoch": 1.4764542936288088,
"grad_norm": 2.2313504219055176,
"learning_rate": 2.044267087374136e-05,
"loss": 0.2913,
"step": 10660
},
{
"epoch": 1.4778393351800554,
"grad_norm": 1.933903694152832,
"learning_rate": 2.0424110250104405e-05,
"loss": 0.2789,
"step": 10670
},
{
"epoch": 1.479224376731302,
"grad_norm": 3.100245952606201,
"learning_rate": 2.040554962646745e-05,
"loss": 0.2594,
"step": 10680
},
{
"epoch": 1.4806094182825484,
"grad_norm": 5.2458953857421875,
"learning_rate": 2.0386989002830496e-05,
"loss": 0.2565,
"step": 10690
},
{
"epoch": 1.481994459833795,
"grad_norm": 2.68757963180542,
"learning_rate": 2.036842837919354e-05,
"loss": 0.2794,
"step": 10700
},
{
"epoch": 1.4833795013850415,
"grad_norm": 2.6526389122009277,
"learning_rate": 2.034986775555659e-05,
"loss": 0.2529,
"step": 10710
},
{
"epoch": 1.4847645429362881,
"grad_norm": 2.440277338027954,
"learning_rate": 2.0331307131919634e-05,
"loss": 0.2684,
"step": 10720
},
{
"epoch": 1.4861495844875345,
"grad_norm": 3.8751096725463867,
"learning_rate": 2.031274650828268e-05,
"loss": 0.2469,
"step": 10730
},
{
"epoch": 1.4875346260387812,
"grad_norm": 1.865172266960144,
"learning_rate": 2.0294185884645725e-05,
"loss": 0.3031,
"step": 10740
},
{
"epoch": 1.4889196675900278,
"grad_norm": 2.3093738555908203,
"learning_rate": 2.0275625261008772e-05,
"loss": 0.2806,
"step": 10750
},
{
"epoch": 1.4903047091412742,
"grad_norm": 3.2070140838623047,
"learning_rate": 2.0257064637371816e-05,
"loss": 0.2778,
"step": 10760
},
{
"epoch": 1.4916897506925209,
"grad_norm": 2.056748151779175,
"learning_rate": 2.023850401373486e-05,
"loss": 0.2943,
"step": 10770
},
{
"epoch": 1.4930747922437673,
"grad_norm": 1.8449196815490723,
"learning_rate": 2.021994339009791e-05,
"loss": 0.2767,
"step": 10780
},
{
"epoch": 1.494459833795014,
"grad_norm": 2.679731607437134,
"learning_rate": 2.0201382766460954e-05,
"loss": 0.3003,
"step": 10790
},
{
"epoch": 1.4958448753462603,
"grad_norm": 4.13687801361084,
"learning_rate": 2.0182822142824e-05,
"loss": 0.279,
"step": 10800
},
{
"epoch": 1.497229916897507,
"grad_norm": 1.9377161264419556,
"learning_rate": 2.016611758155074e-05,
"loss": 0.3019,
"step": 10810
},
{
"epoch": 1.4986149584487536,
"grad_norm": 2.9904537200927734,
"learning_rate": 2.0147556957913788e-05,
"loss": 0.3032,
"step": 10820
},
{
"epoch": 1.5,
"grad_norm": 2.0070672035217285,
"learning_rate": 2.0128996334276832e-05,
"loss": 0.2294,
"step": 10830
},
{
"epoch": 1.5013850415512464,
"grad_norm": 2.722790241241455,
"learning_rate": 2.0110435710639876e-05,
"loss": 0.2812,
"step": 10840
},
{
"epoch": 1.502770083102493,
"grad_norm": 2.189173460006714,
"learning_rate": 2.0091875087002926e-05,
"loss": 0.2594,
"step": 10850
},
{
"epoch": 1.5041551246537397,
"grad_norm": 2.6748456954956055,
"learning_rate": 2.007331446336597e-05,
"loss": 0.2776,
"step": 10860
},
{
"epoch": 1.505540166204986,
"grad_norm": 1.898979663848877,
"learning_rate": 2.0054753839729017e-05,
"loss": 0.2419,
"step": 10870
},
{
"epoch": 1.5069252077562327,
"grad_norm": 2.8842554092407227,
"learning_rate": 2.003619321609206e-05,
"loss": 0.2908,
"step": 10880
},
{
"epoch": 1.5083102493074794,
"grad_norm": 2.4269087314605713,
"learning_rate": 2.001763259245511e-05,
"loss": 0.2506,
"step": 10890
},
{
"epoch": 1.5096952908587258,
"grad_norm": 1.824051022529602,
"learning_rate": 1.9999071968818152e-05,
"loss": 0.2592,
"step": 10900
},
{
"epoch": 1.5110803324099722,
"grad_norm": 2.767005443572998,
"learning_rate": 1.99805113451812e-05,
"loss": 0.2193,
"step": 10910
},
{
"epoch": 1.5124653739612188,
"grad_norm": 2.4028172492980957,
"learning_rate": 1.9961950721544247e-05,
"loss": 0.2823,
"step": 10920
},
{
"epoch": 1.5138504155124655,
"grad_norm": 1.6937496662139893,
"learning_rate": 1.994339009790729e-05,
"loss": 0.2885,
"step": 10930
},
{
"epoch": 1.5152354570637119,
"grad_norm": 3.3346996307373047,
"learning_rate": 1.9924829474270338e-05,
"loss": 0.2499,
"step": 10940
},
{
"epoch": 1.5166204986149583,
"grad_norm": 1.9965386390686035,
"learning_rate": 1.9906268850633385e-05,
"loss": 0.2726,
"step": 10950
},
{
"epoch": 1.5180055401662051,
"grad_norm": 3.443002462387085,
"learning_rate": 1.988770822699643e-05,
"loss": 0.2485,
"step": 10960
},
{
"epoch": 1.5193905817174516,
"grad_norm": 2.690556287765503,
"learning_rate": 1.9869147603359476e-05,
"loss": 0.2588,
"step": 10970
},
{
"epoch": 1.520775623268698,
"grad_norm": 4.155259609222412,
"learning_rate": 1.985058697972252e-05,
"loss": 0.2734,
"step": 10980
},
{
"epoch": 1.5221606648199446,
"grad_norm": 1.9144017696380615,
"learning_rate": 1.9832026356085567e-05,
"loss": 0.2525,
"step": 10990
},
{
"epoch": 1.5235457063711912,
"grad_norm": 2.1264841556549072,
"learning_rate": 1.981346573244861e-05,
"loss": 0.2779,
"step": 11000
},
{
"epoch": 1.5249307479224377,
"grad_norm": 1.881961464881897,
"learning_rate": 1.9794905108811658e-05,
"loss": 0.2508,
"step": 11010
},
{
"epoch": 1.526315789473684,
"grad_norm": 2.455152750015259,
"learning_rate": 1.9776344485174705e-05,
"loss": 0.283,
"step": 11020
},
{
"epoch": 1.5277008310249307,
"grad_norm": 3.0961544513702393,
"learning_rate": 1.975778386153775e-05,
"loss": 0.2755,
"step": 11030
},
{
"epoch": 1.5290858725761773,
"grad_norm": 2.5754737854003906,
"learning_rate": 1.9739223237900796e-05,
"loss": 0.3062,
"step": 11040
},
{
"epoch": 1.5304709141274238,
"grad_norm": 2.443967580795288,
"learning_rate": 1.972066261426384e-05,
"loss": 0.2293,
"step": 11050
},
{
"epoch": 1.5318559556786704,
"grad_norm": 2.465186595916748,
"learning_rate": 1.9702101990626887e-05,
"loss": 0.2857,
"step": 11060
},
{
"epoch": 1.533240997229917,
"grad_norm": 1.6813825368881226,
"learning_rate": 1.9683541366989934e-05,
"loss": 0.2465,
"step": 11070
},
{
"epoch": 1.5346260387811634,
"grad_norm": 2.964087963104248,
"learning_rate": 1.9664980743352978e-05,
"loss": 0.2299,
"step": 11080
},
{
"epoch": 1.5360110803324099,
"grad_norm": 2.690337657928467,
"learning_rate": 1.9646420119716022e-05,
"loss": 0.2486,
"step": 11090
},
{
"epoch": 1.5373961218836565,
"grad_norm": 1.8331809043884277,
"learning_rate": 1.962785949607907e-05,
"loss": 0.2713,
"step": 11100
},
{
"epoch": 1.5387811634349031,
"grad_norm": 2.079843759536743,
"learning_rate": 1.9609298872442116e-05,
"loss": 0.269,
"step": 11110
},
{
"epoch": 1.5401662049861495,
"grad_norm": 2.1486682891845703,
"learning_rate": 1.959073824880516e-05,
"loss": 0.2159,
"step": 11120
},
{
"epoch": 1.5415512465373962,
"grad_norm": 1.5516518354415894,
"learning_rate": 1.9572177625168207e-05,
"loss": 0.2468,
"step": 11130
},
{
"epoch": 1.5429362880886428,
"grad_norm": 3.138228178024292,
"learning_rate": 1.9553617001531254e-05,
"loss": 0.2686,
"step": 11140
},
{
"epoch": 1.5443213296398892,
"grad_norm": 3.1182003021240234,
"learning_rate": 1.9535056377894298e-05,
"loss": 0.271,
"step": 11150
},
{
"epoch": 1.5457063711911356,
"grad_norm": 1.9098355770111084,
"learning_rate": 1.9516495754257345e-05,
"loss": 0.2678,
"step": 11160
},
{
"epoch": 1.5470914127423823,
"grad_norm": 2.740372657775879,
"learning_rate": 1.9497935130620393e-05,
"loss": 0.2768,
"step": 11170
},
{
"epoch": 1.548476454293629,
"grad_norm": 3.55387282371521,
"learning_rate": 1.9479374506983436e-05,
"loss": 0.26,
"step": 11180
},
{
"epoch": 1.5498614958448753,
"grad_norm": 6.384958267211914,
"learning_rate": 1.946081388334648e-05,
"loss": 0.2666,
"step": 11190
},
{
"epoch": 1.5512465373961217,
"grad_norm": 1.738592267036438,
"learning_rate": 1.9442253259709527e-05,
"loss": 0.2751,
"step": 11200
},
{
"epoch": 1.5526315789473686,
"grad_norm": 1.8383572101593018,
"learning_rate": 1.9423692636072575e-05,
"loss": 0.2539,
"step": 11210
},
{
"epoch": 1.554016620498615,
"grad_norm": 1.8780285120010376,
"learning_rate": 1.940513201243562e-05,
"loss": 0.2417,
"step": 11220
},
{
"epoch": 1.5554016620498614,
"grad_norm": 2.927269697189331,
"learning_rate": 1.9386571388798666e-05,
"loss": 0.276,
"step": 11230
},
{
"epoch": 1.556786703601108,
"grad_norm": 2.321641206741333,
"learning_rate": 1.936801076516171e-05,
"loss": 0.289,
"step": 11240
},
{
"epoch": 1.5581717451523547,
"grad_norm": 2.094604015350342,
"learning_rate": 1.9349450141524757e-05,
"loss": 0.2822,
"step": 11250
},
{
"epoch": 1.559556786703601,
"grad_norm": 2.0591633319854736,
"learning_rate": 1.9330889517887804e-05,
"loss": 0.2471,
"step": 11260
},
{
"epoch": 1.5609418282548475,
"grad_norm": 1.5026451349258423,
"learning_rate": 1.9312328894250848e-05,
"loss": 0.2624,
"step": 11270
},
{
"epoch": 1.5623268698060941,
"grad_norm": 4.523801803588867,
"learning_rate": 1.9293768270613895e-05,
"loss": 0.2795,
"step": 11280
},
{
"epoch": 1.5637119113573408,
"grad_norm": 2.1060678958892822,
"learning_rate": 1.927520764697694e-05,
"loss": 0.2823,
"step": 11290
},
{
"epoch": 1.5650969529085872,
"grad_norm": 2.6808173656463623,
"learning_rate": 1.9256647023339986e-05,
"loss": 0.2963,
"step": 11300
},
{
"epoch": 1.5664819944598338,
"grad_norm": 2.0549914836883545,
"learning_rate": 1.923808639970303e-05,
"loss": 0.2472,
"step": 11310
},
{
"epoch": 1.5678670360110805,
"grad_norm": 2.008946657180786,
"learning_rate": 1.9219525776066077e-05,
"loss": 0.2275,
"step": 11320
},
{
"epoch": 1.5692520775623269,
"grad_norm": 1.7144265174865723,
"learning_rate": 1.9200965152429124e-05,
"loss": 0.2256,
"step": 11330
},
{
"epoch": 1.5706371191135733,
"grad_norm": 2.5419716835021973,
"learning_rate": 1.9182404528792168e-05,
"loss": 0.2723,
"step": 11340
},
{
"epoch": 1.57202216066482,
"grad_norm": 3.4930813312530518,
"learning_rate": 1.9163843905155215e-05,
"loss": 0.3037,
"step": 11350
},
{
"epoch": 1.5734072022160666,
"grad_norm": 5.525070667266846,
"learning_rate": 1.9145283281518262e-05,
"loss": 0.2941,
"step": 11360
},
{
"epoch": 1.574792243767313,
"grad_norm": 2.0388412475585938,
"learning_rate": 1.9126722657881306e-05,
"loss": 0.2377,
"step": 11370
},
{
"epoch": 1.5761772853185596,
"grad_norm": 3.1941895484924316,
"learning_rate": 1.9108162034244353e-05,
"loss": 0.247,
"step": 11380
},
{
"epoch": 1.5775623268698062,
"grad_norm": 2.0771210193634033,
"learning_rate": 1.90896014106074e-05,
"loss": 0.2612,
"step": 11390
},
{
"epoch": 1.5789473684210527,
"grad_norm": 1.8206599950790405,
"learning_rate": 1.9071040786970444e-05,
"loss": 0.2669,
"step": 11400
},
{
"epoch": 1.580332409972299,
"grad_norm": 2.1787009239196777,
"learning_rate": 1.9052480163333488e-05,
"loss": 0.2582,
"step": 11410
},
{
"epoch": 1.5817174515235457,
"grad_norm": 2.1410577297210693,
"learning_rate": 1.9033919539696535e-05,
"loss": 0.262,
"step": 11420
},
{
"epoch": 1.5831024930747923,
"grad_norm": 2.490060329437256,
"learning_rate": 1.901535891605958e-05,
"loss": 0.2337,
"step": 11430
},
{
"epoch": 1.5844875346260388,
"grad_norm": 2.937830924987793,
"learning_rate": 1.8996798292422626e-05,
"loss": 0.3014,
"step": 11440
},
{
"epoch": 1.5858725761772852,
"grad_norm": 2.0263595581054688,
"learning_rate": 1.8978237668785674e-05,
"loss": 0.2887,
"step": 11450
},
{
"epoch": 1.587257617728532,
"grad_norm": 1.8784677982330322,
"learning_rate": 1.8959677045148717e-05,
"loss": 0.2547,
"step": 11460
},
{
"epoch": 1.5886426592797784,
"grad_norm": 2.2667269706726074,
"learning_rate": 1.8941116421511765e-05,
"loss": 0.2693,
"step": 11470
},
{
"epoch": 1.5900277008310248,
"grad_norm": 2.0640201568603516,
"learning_rate": 1.8922555797874812e-05,
"loss": 0.2414,
"step": 11480
},
{
"epoch": 1.5914127423822715,
"grad_norm": 2.125096559524536,
"learning_rate": 1.8903995174237856e-05,
"loss": 0.2657,
"step": 11490
},
{
"epoch": 1.5927977839335181,
"grad_norm": 3.6004934310913086,
"learning_rate": 1.88854345506009e-05,
"loss": 0.2973,
"step": 11500
},
{
"epoch": 1.5941828254847645,
"grad_norm": 2.045348644256592,
"learning_rate": 1.8866873926963947e-05,
"loss": 0.2659,
"step": 11510
},
{
"epoch": 1.595567867036011,
"grad_norm": 1.4382702112197876,
"learning_rate": 1.8848313303326994e-05,
"loss": 0.2434,
"step": 11520
},
{
"epoch": 1.5969529085872576,
"grad_norm": 2.119570016860962,
"learning_rate": 1.8829752679690038e-05,
"loss": 0.3205,
"step": 11530
},
{
"epoch": 1.5983379501385042,
"grad_norm": 1.7247291803359985,
"learning_rate": 1.8811192056053085e-05,
"loss": 0.2501,
"step": 11540
},
{
"epoch": 1.5997229916897506,
"grad_norm": 3.151301145553589,
"learning_rate": 1.8792631432416132e-05,
"loss": 0.3,
"step": 11550
},
{
"epoch": 1.6011080332409973,
"grad_norm": 2.6698825359344482,
"learning_rate": 1.8774070808779176e-05,
"loss": 0.2233,
"step": 11560
},
{
"epoch": 1.602493074792244,
"grad_norm": 3.476079225540161,
"learning_rate": 1.8755510185142223e-05,
"loss": 0.3094,
"step": 11570
},
{
"epoch": 1.6038781163434903,
"grad_norm": 1.5601874589920044,
"learning_rate": 1.873694956150527e-05,
"loss": 0.2747,
"step": 11580
},
{
"epoch": 1.6052631578947367,
"grad_norm": 2.1733288764953613,
"learning_rate": 1.8718388937868314e-05,
"loss": 0.2538,
"step": 11590
},
{
"epoch": 1.6066481994459834,
"grad_norm": 1.6128807067871094,
"learning_rate": 1.869982831423136e-05,
"loss": 0.2436,
"step": 11600
},
{
"epoch": 1.60803324099723,
"grad_norm": 6.194311618804932,
"learning_rate": 1.8681267690594405e-05,
"loss": 0.2285,
"step": 11610
},
{
"epoch": 1.6094182825484764,
"grad_norm": 1.6359772682189941,
"learning_rate": 1.8662707066957452e-05,
"loss": 0.286,
"step": 11620
},
{
"epoch": 1.610803324099723,
"grad_norm": 1.7897950410842896,
"learning_rate": 1.8644146443320496e-05,
"loss": 0.2714,
"step": 11630
},
{
"epoch": 1.6121883656509697,
"grad_norm": 3.663844585418701,
"learning_rate": 1.8625585819683543e-05,
"loss": 0.2954,
"step": 11640
},
{
"epoch": 1.613573407202216,
"grad_norm": 2.3490071296691895,
"learning_rate": 1.8607025196046587e-05,
"loss": 0.2516,
"step": 11650
},
{
"epoch": 1.6149584487534625,
"grad_norm": 1.6959421634674072,
"learning_rate": 1.8588464572409634e-05,
"loss": 0.2355,
"step": 11660
},
{
"epoch": 1.6163434903047091,
"grad_norm": 4.680472373962402,
"learning_rate": 1.856990394877268e-05,
"loss": 0.2936,
"step": 11670
},
{
"epoch": 1.6177285318559558,
"grad_norm": 3.6040542125701904,
"learning_rate": 1.8551343325135725e-05,
"loss": 0.2481,
"step": 11680
},
{
"epoch": 1.6191135734072022,
"grad_norm": 2.7770557403564453,
"learning_rate": 1.8532782701498772e-05,
"loss": 0.2705,
"step": 11690
},
{
"epoch": 1.6204986149584486,
"grad_norm": 2.4273083209991455,
"learning_rate": 1.851422207786182e-05,
"loss": 0.2342,
"step": 11700
},
{
"epoch": 1.6218836565096952,
"grad_norm": 3.629061460494995,
"learning_rate": 1.8495661454224863e-05,
"loss": 0.2755,
"step": 11710
},
{
"epoch": 1.6232686980609419,
"grad_norm": 1.9627671241760254,
"learning_rate": 1.8477100830587907e-05,
"loss": 0.2337,
"step": 11720
},
{
"epoch": 1.6246537396121883,
"grad_norm": 2.104898452758789,
"learning_rate": 1.8458540206950954e-05,
"loss": 0.2648,
"step": 11730
},
{
"epoch": 1.626038781163435,
"grad_norm": 3.2535183429718018,
"learning_rate": 1.8439979583314e-05,
"loss": 0.2304,
"step": 11740
},
{
"epoch": 1.6274238227146816,
"grad_norm": 3.0654265880584717,
"learning_rate": 1.8421418959677045e-05,
"loss": 0.2601,
"step": 11750
},
{
"epoch": 1.628808864265928,
"grad_norm": 5.312877655029297,
"learning_rate": 1.8402858336040093e-05,
"loss": 0.2391,
"step": 11760
},
{
"epoch": 1.6301939058171744,
"grad_norm": 2.483633518218994,
"learning_rate": 1.838429771240314e-05,
"loss": 0.2387,
"step": 11770
},
{
"epoch": 1.631578947368421,
"grad_norm": 2.8224122524261475,
"learning_rate": 1.8365737088766184e-05,
"loss": 0.2573,
"step": 11780
},
{
"epoch": 1.6329639889196677,
"grad_norm": 5.308402061462402,
"learning_rate": 1.834717646512923e-05,
"loss": 0.2999,
"step": 11790
},
{
"epoch": 1.634349030470914,
"grad_norm": 3.696424722671509,
"learning_rate": 1.8328615841492278e-05,
"loss": 0.2781,
"step": 11800
},
{
"epoch": 1.6357340720221607,
"grad_norm": 1.6067745685577393,
"learning_rate": 1.8310055217855322e-05,
"loss": 0.2638,
"step": 11810
},
{
"epoch": 1.6371191135734073,
"grad_norm": 1.8693360090255737,
"learning_rate": 1.8291494594218366e-05,
"loss": 0.2355,
"step": 11820
},
{
"epoch": 1.6385041551246537,
"grad_norm": 2.0607070922851562,
"learning_rate": 1.8272933970581413e-05,
"loss": 0.2543,
"step": 11830
},
{
"epoch": 1.6398891966759002,
"grad_norm": 1.770366907119751,
"learning_rate": 1.8254373346944457e-05,
"loss": 0.2417,
"step": 11840
},
{
"epoch": 1.6412742382271468,
"grad_norm": 2.100710153579712,
"learning_rate": 1.8235812723307504e-05,
"loss": 0.2341,
"step": 11850
},
{
"epoch": 1.6426592797783934,
"grad_norm": 3.348633050918579,
"learning_rate": 1.821725209967055e-05,
"loss": 0.2537,
"step": 11860
},
{
"epoch": 1.6440443213296398,
"grad_norm": 2.102045774459839,
"learning_rate": 1.8198691476033595e-05,
"loss": 0.2416,
"step": 11870
},
{
"epoch": 1.6454293628808865,
"grad_norm": 4.399439334869385,
"learning_rate": 1.8180130852396642e-05,
"loss": 0.2804,
"step": 11880
},
{
"epoch": 1.6468144044321331,
"grad_norm": 2.43581485748291,
"learning_rate": 1.816157022875969e-05,
"loss": 0.2577,
"step": 11890
},
{
"epoch": 1.6481994459833795,
"grad_norm": 5.009562015533447,
"learning_rate": 1.8143009605122733e-05,
"loss": 0.2739,
"step": 11900
},
{
"epoch": 1.649584487534626,
"grad_norm": 1.979146122932434,
"learning_rate": 1.812444898148578e-05,
"loss": 0.243,
"step": 11910
},
{
"epoch": 1.6509695290858726,
"grad_norm": 2.0031955242156982,
"learning_rate": 1.8105888357848824e-05,
"loss": 0.2533,
"step": 11920
},
{
"epoch": 1.6523545706371192,
"grad_norm": 2.9630982875823975,
"learning_rate": 1.808732773421187e-05,
"loss": 0.3004,
"step": 11930
},
{
"epoch": 1.6537396121883656,
"grad_norm": 4.047551155090332,
"learning_rate": 1.8068767110574915e-05,
"loss": 0.2414,
"step": 11940
},
{
"epoch": 1.655124653739612,
"grad_norm": 2.5093228816986084,
"learning_rate": 1.8050206486937962e-05,
"loss": 0.2627,
"step": 11950
},
{
"epoch": 1.6565096952908587,
"grad_norm": 2.37447190284729,
"learning_rate": 1.803164586330101e-05,
"loss": 0.2665,
"step": 11960
},
{
"epoch": 1.6578947368421053,
"grad_norm": 3.737452507019043,
"learning_rate": 1.8013085239664053e-05,
"loss": 0.2694,
"step": 11970
},
{
"epoch": 1.6592797783933517,
"grad_norm": 2.8096158504486084,
"learning_rate": 1.79945246160271e-05,
"loss": 0.2797,
"step": 11980
},
{
"epoch": 1.6606648199445984,
"grad_norm": 1.845086932182312,
"learning_rate": 1.7975963992390148e-05,
"loss": 0.2508,
"step": 11990
},
{
"epoch": 1.662049861495845,
"grad_norm": 3.416377067565918,
"learning_rate": 1.795740336875319e-05,
"loss": 0.2383,
"step": 12000
},
{
"epoch": 1.6626038781163435,
"eval_loss": 0.29201897978782654,
"eval_runtime": 1417.9458,
"eval_samples_per_second": 6.432,
"eval_steps_per_second": 0.804,
"step": 12004
},
{
"epoch": 1.6634349030470914,
"grad_norm": 2.057034730911255,
"learning_rate": 1.793884274511624e-05,
"loss": 0.2622,
"step": 12010
},
{
"epoch": 1.6648199445983378,
"grad_norm": 2.438023805618286,
"learning_rate": 1.7920282121479283e-05,
"loss": 0.3083,
"step": 12020
},
{
"epoch": 1.6662049861495845,
"grad_norm": 4.761218547821045,
"learning_rate": 1.790172149784233e-05,
"loss": 0.2488,
"step": 12030
},
{
"epoch": 1.667590027700831,
"grad_norm": 3.3737921714782715,
"learning_rate": 1.7883160874205374e-05,
"loss": 0.2418,
"step": 12040
},
{
"epoch": 1.6689750692520775,
"grad_norm": 2.7591564655303955,
"learning_rate": 1.786460025056842e-05,
"loss": 0.2849,
"step": 12050
},
{
"epoch": 1.6703601108033241,
"grad_norm": 2.5756118297576904,
"learning_rate": 1.7846039626931465e-05,
"loss": 0.2317,
"step": 12060
},
{
"epoch": 1.6717451523545708,
"grad_norm": 2.3147664070129395,
"learning_rate": 1.7827479003294512e-05,
"loss": 0.2806,
"step": 12070
},
{
"epoch": 1.6731301939058172,
"grad_norm": 3.616359233856201,
"learning_rate": 1.780891837965756e-05,
"loss": 0.2557,
"step": 12080
},
{
"epoch": 1.6745152354570636,
"grad_norm": 2.078604221343994,
"learning_rate": 1.7790357756020603e-05,
"loss": 0.2752,
"step": 12090
},
{
"epoch": 1.6759002770083102,
"grad_norm": 1.7128912210464478,
"learning_rate": 1.777179713238365e-05,
"loss": 0.2697,
"step": 12100
},
{
"epoch": 1.6772853185595569,
"grad_norm": 1.9377413988113403,
"learning_rate": 1.7753236508746697e-05,
"loss": 0.2526,
"step": 12110
},
{
"epoch": 1.6786703601108033,
"grad_norm": 2.5126800537109375,
"learning_rate": 1.773467588510974e-05,
"loss": 0.2824,
"step": 12120
},
{
"epoch": 1.6800554016620497,
"grad_norm": 1.8020765781402588,
"learning_rate": 1.7716115261472785e-05,
"loss": 0.2659,
"step": 12130
},
{
"epoch": 1.6814404432132966,
"grad_norm": 2.5396931171417236,
"learning_rate": 1.7697554637835832e-05,
"loss": 0.2583,
"step": 12140
},
{
"epoch": 1.682825484764543,
"grad_norm": 2.1180899143218994,
"learning_rate": 1.767899401419888e-05,
"loss": 0.2561,
"step": 12150
},
{
"epoch": 1.6842105263157894,
"grad_norm": 2.3820462226867676,
"learning_rate": 1.7660433390561923e-05,
"loss": 0.2675,
"step": 12160
},
{
"epoch": 1.685595567867036,
"grad_norm": 2.2656431198120117,
"learning_rate": 1.764187276692497e-05,
"loss": 0.2718,
"step": 12170
},
{
"epoch": 1.6869806094182827,
"grad_norm": 1.6295078992843628,
"learning_rate": 1.7623312143288017e-05,
"loss": 0.2418,
"step": 12180
},
{
"epoch": 1.688365650969529,
"grad_norm": 2.898495674133301,
"learning_rate": 1.760475151965106e-05,
"loss": 0.2562,
"step": 12190
},
{
"epoch": 1.6897506925207755,
"grad_norm": 2.290031671524048,
"learning_rate": 1.758619089601411e-05,
"loss": 0.262,
"step": 12200
},
{
"epoch": 1.6911357340720221,
"grad_norm": 3.4145724773406982,
"learning_rate": 1.7567630272377156e-05,
"loss": 0.2502,
"step": 12210
},
{
"epoch": 1.6925207756232687,
"grad_norm": 1.9184894561767578,
"learning_rate": 1.75490696487402e-05,
"loss": 0.2703,
"step": 12220
},
{
"epoch": 1.6939058171745152,
"grad_norm": 1.8107856512069702,
"learning_rate": 1.7530509025103243e-05,
"loss": 0.2628,
"step": 12230
},
{
"epoch": 1.6952908587257618,
"grad_norm": 3.8108813762664795,
"learning_rate": 1.751194840146629e-05,
"loss": 0.2893,
"step": 12240
},
{
"epoch": 1.6966759002770084,
"grad_norm": 2.709773063659668,
"learning_rate": 1.7493387777829334e-05,
"loss": 0.2641,
"step": 12250
},
{
"epoch": 1.6980609418282548,
"grad_norm": 1.906054139137268,
"learning_rate": 1.747482715419238e-05,
"loss": 0.2278,
"step": 12260
},
{
"epoch": 1.6994459833795013,
"grad_norm": 1.6972445249557495,
"learning_rate": 1.745626653055543e-05,
"loss": 0.2602,
"step": 12270
},
{
"epoch": 1.700831024930748,
"grad_norm": 2.5641844272613525,
"learning_rate": 1.7437705906918473e-05,
"loss": 0.2422,
"step": 12280
},
{
"epoch": 1.7022160664819945,
"grad_norm": 3.255160093307495,
"learning_rate": 1.741914528328152e-05,
"loss": 0.242,
"step": 12290
},
{
"epoch": 1.703601108033241,
"grad_norm": 1.6492815017700195,
"learning_rate": 1.7400584659644567e-05,
"loss": 0.2636,
"step": 12300
},
{
"epoch": 1.7049861495844876,
"grad_norm": 2.919952630996704,
"learning_rate": 1.738202403600761e-05,
"loss": 0.2592,
"step": 12310
},
{
"epoch": 1.7063711911357342,
"grad_norm": 2.054386854171753,
"learning_rate": 1.7363463412370658e-05,
"loss": 0.2757,
"step": 12320
},
{
"epoch": 1.7077562326869806,
"grad_norm": 3.644105911254883,
"learning_rate": 1.7344902788733702e-05,
"loss": 0.2622,
"step": 12330
},
{
"epoch": 1.709141274238227,
"grad_norm": 1.5332365036010742,
"learning_rate": 1.732634216509675e-05,
"loss": 0.261,
"step": 12340
},
{
"epoch": 1.7105263157894737,
"grad_norm": 1.936841368675232,
"learning_rate": 1.7307781541459793e-05,
"loss": 0.2524,
"step": 12350
},
{
"epoch": 1.7119113573407203,
"grad_norm": 2.621289014816284,
"learning_rate": 1.728922091782284e-05,
"loss": 0.2641,
"step": 12360
},
{
"epoch": 1.7132963988919667,
"grad_norm": 2.3396177291870117,
"learning_rate": 1.7270660294185887e-05,
"loss": 0.2534,
"step": 12370
},
{
"epoch": 1.7146814404432131,
"grad_norm": 4.555574893951416,
"learning_rate": 1.725209967054893e-05,
"loss": 0.2479,
"step": 12380
},
{
"epoch": 1.71606648199446,
"grad_norm": 2.390497922897339,
"learning_rate": 1.7233539046911978e-05,
"loss": 0.2653,
"step": 12390
},
{
"epoch": 1.7174515235457064,
"grad_norm": 2.245471954345703,
"learning_rate": 1.7214978423275025e-05,
"loss": 0.2436,
"step": 12400
},
{
"epoch": 1.7188365650969528,
"grad_norm": 2.013014554977417,
"learning_rate": 1.719641779963807e-05,
"loss": 0.2447,
"step": 12410
},
{
"epoch": 1.7202216066481995,
"grad_norm": 2.733628988265991,
"learning_rate": 1.7177857176001116e-05,
"loss": 0.215,
"step": 12420
},
{
"epoch": 1.721606648199446,
"grad_norm": 3.422853946685791,
"learning_rate": 1.7159296552364164e-05,
"loss": 0.2922,
"step": 12430
},
{
"epoch": 1.7229916897506925,
"grad_norm": 1.407198429107666,
"learning_rate": 1.7140735928727207e-05,
"loss": 0.2427,
"step": 12440
},
{
"epoch": 1.724376731301939,
"grad_norm": 2.1980371475219727,
"learning_rate": 1.712217530509025e-05,
"loss": 0.2996,
"step": 12450
},
{
"epoch": 1.7257617728531855,
"grad_norm": 2.1491382122039795,
"learning_rate": 1.71036146814533e-05,
"loss": 0.2432,
"step": 12460
},
{
"epoch": 1.7271468144044322,
"grad_norm": 3.203516960144043,
"learning_rate": 1.7085054057816342e-05,
"loss": 0.2397,
"step": 12470
},
{
"epoch": 1.7285318559556786,
"grad_norm": 1.9788877964019775,
"learning_rate": 1.706649343417939e-05,
"loss": 0.2265,
"step": 12480
},
{
"epoch": 1.7299168975069252,
"grad_norm": 2.2260544300079346,
"learning_rate": 1.7047932810542437e-05,
"loss": 0.2524,
"step": 12490
},
{
"epoch": 1.7313019390581719,
"grad_norm": 3.5737645626068115,
"learning_rate": 1.702937218690548e-05,
"loss": 0.2211,
"step": 12500
},
{
"epoch": 1.7326869806094183,
"grad_norm": 2.1288580894470215,
"learning_rate": 1.7010811563268528e-05,
"loss": 0.2408,
"step": 12510
},
{
"epoch": 1.7340720221606647,
"grad_norm": 2.047696113586426,
"learning_rate": 1.6992250939631575e-05,
"loss": 0.2221,
"step": 12520
},
{
"epoch": 1.7354570637119113,
"grad_norm": 2.0065062046051025,
"learning_rate": 1.697369031599462e-05,
"loss": 0.2525,
"step": 12530
},
{
"epoch": 1.736842105263158,
"grad_norm": 11.675251007080078,
"learning_rate": 1.6955129692357662e-05,
"loss": 0.3013,
"step": 12540
},
{
"epoch": 1.7382271468144044,
"grad_norm": 3.4562997817993164,
"learning_rate": 1.693656906872071e-05,
"loss": 0.3053,
"step": 12550
},
{
"epoch": 1.739612188365651,
"grad_norm": 1.8160446882247925,
"learning_rate": 1.6918008445083757e-05,
"loss": 0.253,
"step": 12560
},
{
"epoch": 1.7409972299168976,
"grad_norm": 2.6544394493103027,
"learning_rate": 1.68994478214468e-05,
"loss": 0.2457,
"step": 12570
},
{
"epoch": 1.742382271468144,
"grad_norm": 2.2523653507232666,
"learning_rate": 1.6880887197809848e-05,
"loss": 0.2778,
"step": 12580
},
{
"epoch": 1.7437673130193905,
"grad_norm": 3.4826242923736572,
"learning_rate": 1.6862326574172895e-05,
"loss": 0.2264,
"step": 12590
},
{
"epoch": 1.745152354570637,
"grad_norm": 3.331815242767334,
"learning_rate": 1.684376595053594e-05,
"loss": 0.2287,
"step": 12600
},
{
"epoch": 1.7465373961218837,
"grad_norm": 2.1905879974365234,
"learning_rate": 1.6825205326898986e-05,
"loss": 0.2654,
"step": 12610
},
{
"epoch": 1.7479224376731302,
"grad_norm": 1.8256255388259888,
"learning_rate": 1.6806644703262033e-05,
"loss": 0.2156,
"step": 12620
},
{
"epoch": 1.7493074792243766,
"grad_norm": 2.4552223682403564,
"learning_rate": 1.6788084079625077e-05,
"loss": 0.2542,
"step": 12630
},
{
"epoch": 1.7506925207756234,
"grad_norm": 2.0152883529663086,
"learning_rate": 1.676952345598812e-05,
"loss": 0.2457,
"step": 12640
},
{
"epoch": 1.7520775623268698,
"grad_norm": 2.6501598358154297,
"learning_rate": 1.6750962832351168e-05,
"loss": 0.2577,
"step": 12650
},
{
"epoch": 1.7534626038781163,
"grad_norm": 2.4002187252044678,
"learning_rate": 1.6732402208714212e-05,
"loss": 0.2704,
"step": 12660
},
{
"epoch": 1.754847645429363,
"grad_norm": 1.940310001373291,
"learning_rate": 1.671384158507726e-05,
"loss": 0.264,
"step": 12670
},
{
"epoch": 1.7562326869806095,
"grad_norm": 2.7032876014709473,
"learning_rate": 1.6695280961440306e-05,
"loss": 0.2555,
"step": 12680
},
{
"epoch": 1.757617728531856,
"grad_norm": 3.1682300567626953,
"learning_rate": 1.667672033780335e-05,
"loss": 0.2463,
"step": 12690
},
{
"epoch": 1.7590027700831024,
"grad_norm": 1.8988887071609497,
"learning_rate": 1.6658159714166397e-05,
"loss": 0.2538,
"step": 12700
},
{
"epoch": 1.760387811634349,
"grad_norm": 2.3494110107421875,
"learning_rate": 1.6639599090529444e-05,
"loss": 0.2371,
"step": 12710
},
{
"epoch": 1.7617728531855956,
"grad_norm": 1.774849772453308,
"learning_rate": 1.6621038466892488e-05,
"loss": 0.3002,
"step": 12720
},
{
"epoch": 1.763157894736842,
"grad_norm": 9.336555480957031,
"learning_rate": 1.6602477843255536e-05,
"loss": 0.2931,
"step": 12730
},
{
"epoch": 1.7645429362880887,
"grad_norm": 1.589762568473816,
"learning_rate": 1.6583917219618583e-05,
"loss": 0.2275,
"step": 12740
},
{
"epoch": 1.7659279778393353,
"grad_norm": 2.8760030269622803,
"learning_rate": 1.6565356595981627e-05,
"loss": 0.2964,
"step": 12750
},
{
"epoch": 1.7673130193905817,
"grad_norm": 2.5935494899749756,
"learning_rate": 1.654679597234467e-05,
"loss": 0.2206,
"step": 12760
},
{
"epoch": 1.7686980609418281,
"grad_norm": 1.6379286050796509,
"learning_rate": 1.6528235348707718e-05,
"loss": 0.2307,
"step": 12770
},
{
"epoch": 1.7700831024930748,
"grad_norm": 3.137521266937256,
"learning_rate": 1.6509674725070765e-05,
"loss": 0.2435,
"step": 12780
},
{
"epoch": 1.7714681440443214,
"grad_norm": 2.229773998260498,
"learning_rate": 1.649111410143381e-05,
"loss": 0.2427,
"step": 12790
},
{
"epoch": 1.7728531855955678,
"grad_norm": 1.566584587097168,
"learning_rate": 1.6472553477796856e-05,
"loss": 0.2731,
"step": 12800
},
{
"epoch": 1.7742382271468145,
"grad_norm": 1.718691110610962,
"learning_rate": 1.6453992854159903e-05,
"loss": 0.2549,
"step": 12810
},
{
"epoch": 1.775623268698061,
"grad_norm": 2.1555016040802,
"learning_rate": 1.6435432230522947e-05,
"loss": 0.2222,
"step": 12820
},
{
"epoch": 1.7770083102493075,
"grad_norm": 1.805146336555481,
"learning_rate": 1.6416871606885994e-05,
"loss": 0.2555,
"step": 12830
},
{
"epoch": 1.778393351800554,
"grad_norm": 1.7481995820999146,
"learning_rate": 1.639831098324904e-05,
"loss": 0.256,
"step": 12840
},
{
"epoch": 1.7797783933518005,
"grad_norm": 1.9545890092849731,
"learning_rate": 1.6379750359612085e-05,
"loss": 0.2571,
"step": 12850
},
{
"epoch": 1.7811634349030472,
"grad_norm": 1.8375189304351807,
"learning_rate": 1.636118973597513e-05,
"loss": 0.2388,
"step": 12860
},
{
"epoch": 1.7825484764542936,
"grad_norm": 1.7798672914505005,
"learning_rate": 1.6342629112338176e-05,
"loss": 0.2692,
"step": 12870
},
{
"epoch": 1.78393351800554,
"grad_norm": 4.45228385925293,
"learning_rate": 1.632406848870122e-05,
"loss": 0.2504,
"step": 12880
},
{
"epoch": 1.7853185595567869,
"grad_norm": 1.8389291763305664,
"learning_rate": 1.6305507865064267e-05,
"loss": 0.2507,
"step": 12890
},
{
"epoch": 1.7867036011080333,
"grad_norm": 1.5367673635482788,
"learning_rate": 1.6286947241427314e-05,
"loss": 0.2639,
"step": 12900
},
{
"epoch": 1.7880886426592797,
"grad_norm": 2.4199717044830322,
"learning_rate": 1.6268386617790358e-05,
"loss": 0.2615,
"step": 12910
},
{
"epoch": 1.7894736842105263,
"grad_norm": 2.12934947013855,
"learning_rate": 1.6249825994153405e-05,
"loss": 0.2774,
"step": 12920
},
{
"epoch": 1.790858725761773,
"grad_norm": 1.7487562894821167,
"learning_rate": 1.6231265370516452e-05,
"loss": 0.3096,
"step": 12930
},
{
"epoch": 1.7922437673130194,
"grad_norm": 1.8843803405761719,
"learning_rate": 1.6212704746879496e-05,
"loss": 0.2188,
"step": 12940
},
{
"epoch": 1.7936288088642658,
"grad_norm": 2.1922812461853027,
"learning_rate": 1.6194144123242543e-05,
"loss": 0.267,
"step": 12950
},
{
"epoch": 1.7950138504155124,
"grad_norm": 1.6102290153503418,
"learning_rate": 1.6175583499605587e-05,
"loss": 0.2425,
"step": 12960
},
{
"epoch": 1.796398891966759,
"grad_norm": 2.733360767364502,
"learning_rate": 1.6157022875968634e-05,
"loss": 0.2836,
"step": 12970
},
{
"epoch": 1.7977839335180055,
"grad_norm": 2.1498701572418213,
"learning_rate": 1.6138462252331678e-05,
"loss": 0.2823,
"step": 12980
},
{
"epoch": 1.799168975069252,
"grad_norm": 1.5078741312026978,
"learning_rate": 1.6119901628694725e-05,
"loss": 0.2216,
"step": 12990
},
{
"epoch": 1.8005540166204987,
"grad_norm": 1.9320144653320312,
"learning_rate": 1.6101341005057773e-05,
"loss": 0.2468,
"step": 13000
},
{
"epoch": 1.8019390581717452,
"grad_norm": 1.9885637760162354,
"learning_rate": 1.6082780381420816e-05,
"loss": 0.2413,
"step": 13010
},
{
"epoch": 1.8033240997229916,
"grad_norm": 2.5898048877716064,
"learning_rate": 1.6064219757783864e-05,
"loss": 0.2327,
"step": 13020
},
{
"epoch": 1.8047091412742382,
"grad_norm": 1.5741478204727173,
"learning_rate": 1.604565913414691e-05,
"loss": 0.2658,
"step": 13030
},
{
"epoch": 1.8060941828254848,
"grad_norm": 2.036979913711548,
"learning_rate": 1.6027098510509955e-05,
"loss": 0.2436,
"step": 13040
},
{
"epoch": 1.8074792243767313,
"grad_norm": 1.650017499923706,
"learning_rate": 1.6008537886873002e-05,
"loss": 0.2696,
"step": 13050
},
{
"epoch": 1.8088642659279779,
"grad_norm": 2.0959882736206055,
"learning_rate": 1.5989977263236046e-05,
"loss": 0.247,
"step": 13060
},
{
"epoch": 1.8102493074792245,
"grad_norm": 2.870182514190674,
"learning_rate": 1.597141663959909e-05,
"loss": 0.2529,
"step": 13070
},
{
"epoch": 1.811634349030471,
"grad_norm": 1.9680346250534058,
"learning_rate": 1.5952856015962137e-05,
"loss": 0.2584,
"step": 13080
},
{
"epoch": 1.8130193905817173,
"grad_norm": 3.2609808444976807,
"learning_rate": 1.5934295392325184e-05,
"loss": 0.2222,
"step": 13090
},
{
"epoch": 1.814404432132964,
"grad_norm": 1.9616683721542358,
"learning_rate": 1.5915734768688228e-05,
"loss": 0.2617,
"step": 13100
},
{
"epoch": 1.8157894736842106,
"grad_norm": 1.50966215133667,
"learning_rate": 1.5897174145051275e-05,
"loss": 0.2489,
"step": 13110
},
{
"epoch": 1.817174515235457,
"grad_norm": 1.8157463073730469,
"learning_rate": 1.5878613521414322e-05,
"loss": 0.2753,
"step": 13120
},
{
"epoch": 1.8185595567867034,
"grad_norm": 2.0493199825286865,
"learning_rate": 1.5860052897777366e-05,
"loss": 0.2452,
"step": 13130
},
{
"epoch": 1.8199445983379503,
"grad_norm": 1.4649819135665894,
"learning_rate": 1.5841492274140413e-05,
"loss": 0.2371,
"step": 13140
},
{
"epoch": 1.8213296398891967,
"grad_norm": 2.031050205230713,
"learning_rate": 1.582293165050346e-05,
"loss": 0.2287,
"step": 13150
},
{
"epoch": 1.8227146814404431,
"grad_norm": 1.8526753187179565,
"learning_rate": 1.5804371026866504e-05,
"loss": 0.2441,
"step": 13160
},
{
"epoch": 1.8240997229916898,
"grad_norm": 1.8726290464401245,
"learning_rate": 1.5785810403229548e-05,
"loss": 0.2684,
"step": 13170
},
{
"epoch": 1.8254847645429364,
"grad_norm": 2.1293463706970215,
"learning_rate": 1.5767249779592595e-05,
"loss": 0.2394,
"step": 13180
},
{
"epoch": 1.8268698060941828,
"grad_norm": 2.2473442554473877,
"learning_rate": 1.5748689155955642e-05,
"loss": 0.2238,
"step": 13190
},
{
"epoch": 1.8282548476454292,
"grad_norm": 12.403087615966797,
"learning_rate": 1.5730128532318686e-05,
"loss": 0.2396,
"step": 13200
},
{
"epoch": 1.8296398891966759,
"grad_norm": 2.3015928268432617,
"learning_rate": 1.5711567908681733e-05,
"loss": 0.2427,
"step": 13210
},
{
"epoch": 1.8310249307479225,
"grad_norm": 1.473212480545044,
"learning_rate": 1.569300728504478e-05,
"loss": 0.2138,
"step": 13220
},
{
"epoch": 1.832409972299169,
"grad_norm": 1.7529211044311523,
"learning_rate": 1.5674446661407824e-05,
"loss": 0.2489,
"step": 13230
},
{
"epoch": 1.8337950138504155,
"grad_norm": 3.970229387283325,
"learning_rate": 1.565588603777087e-05,
"loss": 0.2636,
"step": 13240
},
{
"epoch": 1.8351800554016622,
"grad_norm": 2.284043312072754,
"learning_rate": 1.563732541413392e-05,
"loss": 0.2571,
"step": 13250
},
{
"epoch": 1.8365650969529086,
"grad_norm": 2.4445765018463135,
"learning_rate": 1.5618764790496963e-05,
"loss": 0.219,
"step": 13260
},
{
"epoch": 1.837950138504155,
"grad_norm": 4.571364879608154,
"learning_rate": 1.5600204166860006e-05,
"loss": 0.2162,
"step": 13270
},
{
"epoch": 1.8393351800554016,
"grad_norm": 2.4418375492095947,
"learning_rate": 1.5581643543223054e-05,
"loss": 0.2431,
"step": 13280
},
{
"epoch": 1.8407202216066483,
"grad_norm": 2.472238302230835,
"learning_rate": 1.5563082919586097e-05,
"loss": 0.2483,
"step": 13290
},
{
"epoch": 1.8421052631578947,
"grad_norm": 2.308302879333496,
"learning_rate": 1.5544522295949145e-05,
"loss": 0.2642,
"step": 13300
},
{
"epoch": 1.8434903047091413,
"grad_norm": 1.9199141263961792,
"learning_rate": 1.5525961672312192e-05,
"loss": 0.2178,
"step": 13310
},
{
"epoch": 1.844875346260388,
"grad_norm": 1.79179847240448,
"learning_rate": 1.5507401048675236e-05,
"loss": 0.2351,
"step": 13320
},
{
"epoch": 1.8462603878116344,
"grad_norm": 2.497844934463501,
"learning_rate": 1.5488840425038283e-05,
"loss": 0.2773,
"step": 13330
},
{
"epoch": 1.8476454293628808,
"grad_norm": 2.2991247177124023,
"learning_rate": 1.547027980140133e-05,
"loss": 0.2314,
"step": 13340
},
{
"epoch": 1.8490304709141274,
"grad_norm": 1.5995428562164307,
"learning_rate": 1.5451719177764374e-05,
"loss": 0.2704,
"step": 13350
},
{
"epoch": 1.850415512465374,
"grad_norm": 3.168255567550659,
"learning_rate": 1.543315855412742e-05,
"loss": 0.2568,
"step": 13360
},
{
"epoch": 1.8518005540166205,
"grad_norm": 2.057743787765503,
"learning_rate": 1.5414597930490465e-05,
"loss": 0.2159,
"step": 13370
},
{
"epoch": 1.8531855955678669,
"grad_norm": 1.9587434530258179,
"learning_rate": 1.5396037306853512e-05,
"loss": 0.2616,
"step": 13380
},
{
"epoch": 1.8545706371191135,
"grad_norm": 1.7825109958648682,
"learning_rate": 1.5377476683216556e-05,
"loss": 0.2419,
"step": 13390
},
{
"epoch": 1.8559556786703602,
"grad_norm": 2.3962173461914062,
"learning_rate": 1.5358916059579603e-05,
"loss": 0.2711,
"step": 13400
},
{
"epoch": 1.8573407202216066,
"grad_norm": 1.998607873916626,
"learning_rate": 1.534035543594265e-05,
"loss": 0.2384,
"step": 13410
},
{
"epoch": 1.8587257617728532,
"grad_norm": 1.8341917991638184,
"learning_rate": 1.5321794812305694e-05,
"loss": 0.2455,
"step": 13420
},
{
"epoch": 1.8601108033240998,
"grad_norm": 2.9478933811187744,
"learning_rate": 1.530323418866874e-05,
"loss": 0.2249,
"step": 13430
},
{
"epoch": 1.8614958448753463,
"grad_norm": 2.184619426727295,
"learning_rate": 1.528652962739548e-05,
"loss": 0.2616,
"step": 13440
},
{
"epoch": 1.8628808864265927,
"grad_norm": 1.2912342548370361,
"learning_rate": 1.5267969003758528e-05,
"loss": 0.2492,
"step": 13450
},
{
"epoch": 1.8642659279778393,
"grad_norm": 1.6502461433410645,
"learning_rate": 1.5249408380121573e-05,
"loss": 0.2248,
"step": 13460
},
{
"epoch": 1.865650969529086,
"grad_norm": 2.327885389328003,
"learning_rate": 1.5230847756484619e-05,
"loss": 0.2457,
"step": 13470
},
{
"epoch": 1.8670360110803323,
"grad_norm": 2.1829628944396973,
"learning_rate": 1.5212287132847664e-05,
"loss": 0.2727,
"step": 13480
},
{
"epoch": 1.868421052631579,
"grad_norm": 2.1626899242401123,
"learning_rate": 1.5193726509210712e-05,
"loss": 0.2715,
"step": 13490
},
{
"epoch": 1.8698060941828256,
"grad_norm": 1.8352174758911133,
"learning_rate": 1.5175165885573757e-05,
"loss": 0.2519,
"step": 13500
},
{
"epoch": 1.871191135734072,
"grad_norm": 2.3800196647644043,
"learning_rate": 1.5156605261936803e-05,
"loss": 0.2763,
"step": 13510
},
{
"epoch": 1.8725761772853184,
"grad_norm": 3.2877066135406494,
"learning_rate": 1.513804463829985e-05,
"loss": 0.2352,
"step": 13520
},
{
"epoch": 1.873961218836565,
"grad_norm": 2.6012349128723145,
"learning_rate": 1.5119484014662894e-05,
"loss": 0.2523,
"step": 13530
},
{
"epoch": 1.8753462603878117,
"grad_norm": 3.118762493133545,
"learning_rate": 1.5100923391025939e-05,
"loss": 0.2247,
"step": 13540
},
{
"epoch": 1.8767313019390581,
"grad_norm": 1.825679898262024,
"learning_rate": 1.5082362767388985e-05,
"loss": 0.2371,
"step": 13550
},
{
"epoch": 1.8781163434903048,
"grad_norm": 1.7097175121307373,
"learning_rate": 1.5063802143752032e-05,
"loss": 0.2948,
"step": 13560
},
{
"epoch": 1.8795013850415514,
"grad_norm": 2.4472036361694336,
"learning_rate": 1.5045241520115077e-05,
"loss": 0.2461,
"step": 13570
},
{
"epoch": 1.8808864265927978,
"grad_norm": 2.5957345962524414,
"learning_rate": 1.5026680896478123e-05,
"loss": 0.2742,
"step": 13580
},
{
"epoch": 1.8822714681440442,
"grad_norm": 1.9023525714874268,
"learning_rate": 1.5008120272841168e-05,
"loss": 0.2592,
"step": 13590
},
{
"epoch": 1.8836565096952909,
"grad_norm": 2.0497195720672607,
"learning_rate": 1.4989559649204215e-05,
"loss": 0.2265,
"step": 13600
},
{
"epoch": 1.8850415512465375,
"grad_norm": 1.9706491231918335,
"learning_rate": 1.4970999025567261e-05,
"loss": 0.2663,
"step": 13610
},
{
"epoch": 1.886426592797784,
"grad_norm": 2.4385011196136475,
"learning_rate": 1.4952438401930306e-05,
"loss": 0.2172,
"step": 13620
},
{
"epoch": 1.8878116343490303,
"grad_norm": 2.258354663848877,
"learning_rate": 1.493387777829335e-05,
"loss": 0.2382,
"step": 13630
},
{
"epoch": 1.889196675900277,
"grad_norm": 2.848677635192871,
"learning_rate": 1.4915317154656398e-05,
"loss": 0.2219,
"step": 13640
},
{
"epoch": 1.8905817174515236,
"grad_norm": 2.522754430770874,
"learning_rate": 1.4896756531019443e-05,
"loss": 0.2295,
"step": 13650
},
{
"epoch": 1.89196675900277,
"grad_norm": 2.012890100479126,
"learning_rate": 1.4878195907382489e-05,
"loss": 0.2387,
"step": 13660
},
{
"epoch": 1.8933518005540166,
"grad_norm": 2.1994481086730957,
"learning_rate": 1.4859635283745536e-05,
"loss": 0.2494,
"step": 13670
},
{
"epoch": 1.8947368421052633,
"grad_norm": 1.7102662324905396,
"learning_rate": 1.4841074660108581e-05,
"loss": 0.2619,
"step": 13680
},
{
"epoch": 1.8961218836565097,
"grad_norm": 2.5355281829833984,
"learning_rate": 1.4822514036471627e-05,
"loss": 0.2366,
"step": 13690
},
{
"epoch": 1.897506925207756,
"grad_norm": 1.9577605724334717,
"learning_rate": 1.4803953412834672e-05,
"loss": 0.2639,
"step": 13700
},
{
"epoch": 1.8988919667590027,
"grad_norm": 2.3210580348968506,
"learning_rate": 1.478539278919772e-05,
"loss": 0.2418,
"step": 13710
},
{
"epoch": 1.9002770083102494,
"grad_norm": 1.7135709524154663,
"learning_rate": 1.4766832165560765e-05,
"loss": 0.2498,
"step": 13720
},
{
"epoch": 1.9016620498614958,
"grad_norm": 1.5019444227218628,
"learning_rate": 1.4750127604287506e-05,
"loss": 0.2527,
"step": 13730
},
{
"epoch": 1.9030470914127424,
"grad_norm": 2.4784412384033203,
"learning_rate": 1.4733423043014245e-05,
"loss": 0.2757,
"step": 13740
},
{
"epoch": 1.904432132963989,
"grad_norm": 2.923137664794922,
"learning_rate": 1.4714862419377291e-05,
"loss": 0.224,
"step": 13750
},
{
"epoch": 1.9058171745152355,
"grad_norm": 2.0658411979675293,
"learning_rate": 1.4696301795740338e-05,
"loss": 0.218,
"step": 13760
},
{
"epoch": 1.9072022160664819,
"grad_norm": 4.4282612800598145,
"learning_rate": 1.4677741172103384e-05,
"loss": 0.2321,
"step": 13770
},
{
"epoch": 1.9085872576177285,
"grad_norm": 2.6436312198638916,
"learning_rate": 1.465918054846643e-05,
"loss": 0.2189,
"step": 13780
},
{
"epoch": 1.9099722991689752,
"grad_norm": 1.3336904048919678,
"learning_rate": 1.4640619924829476e-05,
"loss": 0.2594,
"step": 13790
},
{
"epoch": 1.9113573407202216,
"grad_norm": 1.4303447008132935,
"learning_rate": 1.4622059301192522e-05,
"loss": 0.2072,
"step": 13800
},
{
"epoch": 1.912742382271468,
"grad_norm": 2.0474507808685303,
"learning_rate": 1.4603498677555567e-05,
"loss": 0.2438,
"step": 13810
},
{
"epoch": 1.9141274238227148,
"grad_norm": 2.0859858989715576,
"learning_rate": 1.4584938053918613e-05,
"loss": 0.2408,
"step": 13820
},
{
"epoch": 1.9155124653739612,
"grad_norm": 1.888232946395874,
"learning_rate": 1.456637743028166e-05,
"loss": 0.2586,
"step": 13830
},
{
"epoch": 1.9168975069252077,
"grad_norm": 1.5036896467208862,
"learning_rate": 1.4547816806644704e-05,
"loss": 0.2402,
"step": 13840
},
{
"epoch": 1.9182825484764543,
"grad_norm": 3.7954695224761963,
"learning_rate": 1.452925618300775e-05,
"loss": 0.2273,
"step": 13850
},
{
"epoch": 1.919667590027701,
"grad_norm": 7.413731098175049,
"learning_rate": 1.4510695559370795e-05,
"loss": 0.2082,
"step": 13860
},
{
"epoch": 1.9210526315789473,
"grad_norm": 2.4324305057525635,
"learning_rate": 1.4492134935733842e-05,
"loss": 0.2318,
"step": 13870
},
{
"epoch": 1.9224376731301938,
"grad_norm": 2.1324281692504883,
"learning_rate": 1.4473574312096888e-05,
"loss": 0.2511,
"step": 13880
},
{
"epoch": 1.9238227146814404,
"grad_norm": 1.7724601030349731,
"learning_rate": 1.4455013688459933e-05,
"loss": 0.2458,
"step": 13890
},
{
"epoch": 1.925207756232687,
"grad_norm": 2.6785428524017334,
"learning_rate": 1.4436453064822979e-05,
"loss": 0.2472,
"step": 13900
},
{
"epoch": 1.9265927977839334,
"grad_norm": 2.5979104042053223,
"learning_rate": 1.4417892441186026e-05,
"loss": 0.2446,
"step": 13910
},
{
"epoch": 1.92797783933518,
"grad_norm": 1.9766836166381836,
"learning_rate": 1.4399331817549071e-05,
"loss": 0.2529,
"step": 13920
},
{
"epoch": 1.9293628808864267,
"grad_norm": 1.5978502035140991,
"learning_rate": 1.4380771193912117e-05,
"loss": 0.2544,
"step": 13930
},
{
"epoch": 1.9307479224376731,
"grad_norm": 2.787135362625122,
"learning_rate": 1.4362210570275164e-05,
"loss": 0.2549,
"step": 13940
},
{
"epoch": 1.9321329639889195,
"grad_norm": 1.7493928670883179,
"learning_rate": 1.4343649946638208e-05,
"loss": 0.2343,
"step": 13950
},
{
"epoch": 1.9335180055401662,
"grad_norm": 4.298009872436523,
"learning_rate": 1.4325089323001253e-05,
"loss": 0.2137,
"step": 13960
},
{
"epoch": 1.9349030470914128,
"grad_norm": 1.7205559015274048,
"learning_rate": 1.4306528699364299e-05,
"loss": 0.212,
"step": 13970
},
{
"epoch": 1.9362880886426592,
"grad_norm": 2.38380765914917,
"learning_rate": 1.4287968075727346e-05,
"loss": 0.2249,
"step": 13980
},
{
"epoch": 1.9376731301939059,
"grad_norm": 1.6681180000305176,
"learning_rate": 1.4269407452090392e-05,
"loss": 0.2049,
"step": 13990
},
{
"epoch": 1.9390581717451525,
"grad_norm": 1.9246506690979004,
"learning_rate": 1.4250846828453437e-05,
"loss": 0.2088,
"step": 14000
},
{
"epoch": 1.940443213296399,
"grad_norm": 1.7347540855407715,
"learning_rate": 1.4232286204816483e-05,
"loss": 0.2386,
"step": 14010
},
{
"epoch": 1.9418282548476453,
"grad_norm": 2.201300621032715,
"learning_rate": 1.421372558117953e-05,
"loss": 0.2532,
"step": 14020
},
{
"epoch": 1.943213296398892,
"grad_norm": 1.9885168075561523,
"learning_rate": 1.4195164957542575e-05,
"loss": 0.2625,
"step": 14030
},
{
"epoch": 1.9445983379501386,
"grad_norm": 2.3285436630249023,
"learning_rate": 1.417660433390562e-05,
"loss": 0.3034,
"step": 14040
},
{
"epoch": 1.945983379501385,
"grad_norm": 2.0906248092651367,
"learning_rate": 1.4158043710268665e-05,
"loss": 0.2216,
"step": 14050
},
{
"epoch": 1.9473684210526314,
"grad_norm": 4.676970958709717,
"learning_rate": 1.4139483086631712e-05,
"loss": 0.2359,
"step": 14060
},
{
"epoch": 1.9487534626038783,
"grad_norm": 1.8214576244354248,
"learning_rate": 1.4120922462994757e-05,
"loss": 0.2811,
"step": 14070
},
{
"epoch": 1.9501385041551247,
"grad_norm": 2.022029161453247,
"learning_rate": 1.4102361839357803e-05,
"loss": 0.2421,
"step": 14080
},
{
"epoch": 1.951523545706371,
"grad_norm": 1.716650366783142,
"learning_rate": 1.408380121572085e-05,
"loss": 0.2361,
"step": 14090
},
{
"epoch": 1.9529085872576177,
"grad_norm": 5.157698631286621,
"learning_rate": 1.4065240592083896e-05,
"loss": 0.2323,
"step": 14100
},
{
"epoch": 1.9542936288088644,
"grad_norm": 2.0606625080108643,
"learning_rate": 1.4046679968446941e-05,
"loss": 0.2289,
"step": 14110
},
{
"epoch": 1.9556786703601108,
"grad_norm": 1.8860923051834106,
"learning_rate": 1.4028119344809987e-05,
"loss": 0.263,
"step": 14120
},
{
"epoch": 1.9570637119113572,
"grad_norm": 1.9369615316390991,
"learning_rate": 1.4009558721173034e-05,
"loss": 0.2373,
"step": 14130
},
{
"epoch": 1.9584487534626038,
"grad_norm": 2.072380781173706,
"learning_rate": 1.399099809753608e-05,
"loss": 0.2269,
"step": 14140
},
{
"epoch": 1.9598337950138505,
"grad_norm": 1.8450301885604858,
"learning_rate": 1.3972437473899123e-05,
"loss": 0.2124,
"step": 14150
},
{
"epoch": 1.9612188365650969,
"grad_norm": 2.4834184646606445,
"learning_rate": 1.3953876850262169e-05,
"loss": 0.2375,
"step": 14160
},
{
"epoch": 1.9626038781163435,
"grad_norm": 2.2427444458007812,
"learning_rate": 1.3935316226625216e-05,
"loss": 0.2936,
"step": 14170
},
{
"epoch": 1.9639889196675901,
"grad_norm": 2.3271632194519043,
"learning_rate": 1.3916755602988261e-05,
"loss": 0.2835,
"step": 14180
},
{
"epoch": 1.9653739612188366,
"grad_norm": 1.8166530132293701,
"learning_rate": 1.3898194979351307e-05,
"loss": 0.2144,
"step": 14190
},
{
"epoch": 1.966759002770083,
"grad_norm": 2.4603664875030518,
"learning_rate": 1.3879634355714354e-05,
"loss": 0.23,
"step": 14200
},
{
"epoch": 1.9681440443213296,
"grad_norm": 1.665629506111145,
"learning_rate": 1.38610737320774e-05,
"loss": 0.2648,
"step": 14210
},
{
"epoch": 1.9695290858725762,
"grad_norm": 2.7033531665802,
"learning_rate": 1.3842513108440445e-05,
"loss": 0.2634,
"step": 14220
},
{
"epoch": 1.9709141274238227,
"grad_norm": 2.542165517807007,
"learning_rate": 1.382395248480349e-05,
"loss": 0.2244,
"step": 14230
},
{
"epoch": 1.9722991689750693,
"grad_norm": 2.3917179107666016,
"learning_rate": 1.3805391861166538e-05,
"loss": 0.2586,
"step": 14240
},
{
"epoch": 1.973684210526316,
"grad_norm": 3.0972111225128174,
"learning_rate": 1.3786831237529583e-05,
"loss": 0.2225,
"step": 14250
},
{
"epoch": 1.9750692520775623,
"grad_norm": 1.795332431793213,
"learning_rate": 1.3768270613892627e-05,
"loss": 0.2266,
"step": 14260
},
{
"epoch": 1.9764542936288088,
"grad_norm": 3.2170612812042236,
"learning_rate": 1.3749709990255673e-05,
"loss": 0.245,
"step": 14270
},
{
"epoch": 1.9778393351800554,
"grad_norm": 2.6222457885742188,
"learning_rate": 1.373114936661872e-05,
"loss": 0.2335,
"step": 14280
},
{
"epoch": 1.979224376731302,
"grad_norm": 1.6004931926727295,
"learning_rate": 1.3712588742981765e-05,
"loss": 0.2409,
"step": 14290
},
{
"epoch": 1.9806094182825484,
"grad_norm": 2.342514753341675,
"learning_rate": 1.369402811934481e-05,
"loss": 0.2446,
"step": 14300
},
{
"epoch": 1.9819944598337949,
"grad_norm": 1.6561387777328491,
"learning_rate": 1.3675467495707856e-05,
"loss": 0.2394,
"step": 14310
},
{
"epoch": 1.9833795013850417,
"grad_norm": 3.7785511016845703,
"learning_rate": 1.3656906872070903e-05,
"loss": 0.2463,
"step": 14320
},
{
"epoch": 1.9847645429362881,
"grad_norm": 1.7614632844924927,
"learning_rate": 1.3638346248433949e-05,
"loss": 0.2448,
"step": 14330
},
{
"epoch": 1.9861495844875345,
"grad_norm": 1.4015835523605347,
"learning_rate": 1.3619785624796994e-05,
"loss": 0.2339,
"step": 14340
},
{
"epoch": 1.9875346260387812,
"grad_norm": 1.7252857685089111,
"learning_rate": 1.3601225001160042e-05,
"loss": 0.2338,
"step": 14350
},
{
"epoch": 1.9889196675900278,
"grad_norm": 1.8692224025726318,
"learning_rate": 1.3582664377523085e-05,
"loss": 0.2114,
"step": 14360
},
{
"epoch": 1.9903047091412742,
"grad_norm": 1.5987521409988403,
"learning_rate": 1.3564103753886131e-05,
"loss": 0.228,
"step": 14370
},
{
"epoch": 1.9916897506925206,
"grad_norm": 2.2164413928985596,
"learning_rate": 1.3545543130249176e-05,
"loss": 0.2284,
"step": 14380
},
{
"epoch": 1.9930747922437673,
"grad_norm": 2.2460415363311768,
"learning_rate": 1.3526982506612224e-05,
"loss": 0.2315,
"step": 14390
},
{
"epoch": 1.994459833795014,
"grad_norm": 2.1210713386535645,
"learning_rate": 1.350842188297527e-05,
"loss": 0.2825,
"step": 14400
},
{
"epoch": 1.9958448753462603,
"grad_norm": 2.85060977935791,
"learning_rate": 1.3489861259338315e-05,
"loss": 0.2321,
"step": 14410
},
{
"epoch": 1.997229916897507,
"grad_norm": 1.8384126424789429,
"learning_rate": 1.347130063570136e-05,
"loss": 0.2197,
"step": 14420
},
{
"epoch": 1.9986149584487536,
"grad_norm": 1.542345404624939,
"learning_rate": 1.3452740012064407e-05,
"loss": 0.2353,
"step": 14430
},
{
"epoch": 2.0,
"grad_norm": 2.0727384090423584,
"learning_rate": 1.3434179388427453e-05,
"loss": 0.2297,
"step": 14440
},
{
"epoch": 2.0013850415512464,
"grad_norm": 1.7574317455291748,
"learning_rate": 1.3415618764790498e-05,
"loss": 0.1947,
"step": 14450
},
{
"epoch": 2.0027700831024933,
"grad_norm": 1.6832847595214844,
"learning_rate": 1.3397058141153542e-05,
"loss": 0.1959,
"step": 14460
},
{
"epoch": 2.0041551246537397,
"grad_norm": 1.722998857498169,
"learning_rate": 1.337849751751659e-05,
"loss": 0.2054,
"step": 14470
},
{
"epoch": 2.005540166204986,
"grad_norm": 2.045290231704712,
"learning_rate": 1.3359936893879635e-05,
"loss": 0.1766,
"step": 14480
},
{
"epoch": 2.0069252077562325,
"grad_norm": 1.2657092809677124,
"learning_rate": 1.334137627024268e-05,
"loss": 0.1511,
"step": 14490
},
{
"epoch": 2.0083102493074794,
"grad_norm": 1.6155598163604736,
"learning_rate": 1.3322815646605728e-05,
"loss": 0.1591,
"step": 14500
},
{
"epoch": 2.009695290858726,
"grad_norm": 3.8692007064819336,
"learning_rate": 1.3304255022968773e-05,
"loss": 0.208,
"step": 14510
},
{
"epoch": 2.011080332409972,
"grad_norm": 2.4857311248779297,
"learning_rate": 1.3285694399331819e-05,
"loss": 0.1712,
"step": 14520
},
{
"epoch": 2.012465373961219,
"grad_norm": 2.3056626319885254,
"learning_rate": 1.3267133775694864e-05,
"loss": 0.1958,
"step": 14530
},
{
"epoch": 2.0138504155124655,
"grad_norm": 2.1693966388702393,
"learning_rate": 1.3248573152057911e-05,
"loss": 0.1853,
"step": 14540
},
{
"epoch": 2.015235457063712,
"grad_norm": 4.402191162109375,
"learning_rate": 1.3230012528420957e-05,
"loss": 0.2022,
"step": 14550
},
{
"epoch": 2.0166204986149583,
"grad_norm": 1.5139687061309814,
"learning_rate": 1.3211451904784002e-05,
"loss": 0.2085,
"step": 14560
},
{
"epoch": 2.018005540166205,
"grad_norm": 2.144901990890503,
"learning_rate": 1.3192891281147046e-05,
"loss": 0.2012,
"step": 14570
},
{
"epoch": 2.0193905817174516,
"grad_norm": 3.4196605682373047,
"learning_rate": 1.3174330657510093e-05,
"loss": 0.215,
"step": 14580
},
{
"epoch": 2.020775623268698,
"grad_norm": 1.5005124807357788,
"learning_rate": 1.3155770033873139e-05,
"loss": 0.1661,
"step": 14590
},
{
"epoch": 2.0221606648199444,
"grad_norm": 1.7353737354278564,
"learning_rate": 1.3137209410236184e-05,
"loss": 0.1908,
"step": 14600
},
{
"epoch": 2.0235457063711912,
"grad_norm": 2.1631791591644287,
"learning_rate": 1.311864878659923e-05,
"loss": 0.1913,
"step": 14610
},
{
"epoch": 2.0249307479224377,
"grad_norm": 2.5623526573181152,
"learning_rate": 1.3100088162962277e-05,
"loss": 0.1889,
"step": 14620
},
{
"epoch": 2.026315789473684,
"grad_norm": 1.8216127157211304,
"learning_rate": 1.3081527539325323e-05,
"loss": 0.2089,
"step": 14630
},
{
"epoch": 2.027700831024931,
"grad_norm": 2.1311192512512207,
"learning_rate": 1.3062966915688368e-05,
"loss": 0.1938,
"step": 14640
},
{
"epoch": 2.0290858725761773,
"grad_norm": 2.107114315032959,
"learning_rate": 1.3044406292051415e-05,
"loss": 0.1792,
"step": 14650
},
{
"epoch": 2.0304709141274238,
"grad_norm": 2.94160532951355,
"learning_rate": 1.302584566841446e-05,
"loss": 0.1977,
"step": 14660
},
{
"epoch": 2.03185595567867,
"grad_norm": 2.283118724822998,
"learning_rate": 1.3007285044777505e-05,
"loss": 0.1676,
"step": 14670
},
{
"epoch": 2.033240997229917,
"grad_norm": 3.027130365371704,
"learning_rate": 1.298872442114055e-05,
"loss": 0.1662,
"step": 14680
},
{
"epoch": 2.0346260387811634,
"grad_norm": 1.537743091583252,
"learning_rate": 1.2970163797503597e-05,
"loss": 0.1946,
"step": 14690
},
{
"epoch": 2.03601108033241,
"grad_norm": 1.9911495447158813,
"learning_rate": 1.2951603173866643e-05,
"loss": 0.1961,
"step": 14700
},
{
"epoch": 2.0373961218836567,
"grad_norm": 2.0069620609283447,
"learning_rate": 1.2933042550229688e-05,
"loss": 0.217,
"step": 14710
},
{
"epoch": 2.038781163434903,
"grad_norm": 1.7512022256851196,
"learning_rate": 1.2914481926592734e-05,
"loss": 0.1887,
"step": 14720
},
{
"epoch": 2.0401662049861495,
"grad_norm": 2.5431530475616455,
"learning_rate": 1.2895921302955781e-05,
"loss": 0.18,
"step": 14730
},
{
"epoch": 2.041551246537396,
"grad_norm": 3.3946635723114014,
"learning_rate": 1.2877360679318827e-05,
"loss": 0.2296,
"step": 14740
},
{
"epoch": 2.042936288088643,
"grad_norm": 3.5196239948272705,
"learning_rate": 1.2858800055681872e-05,
"loss": 0.1829,
"step": 14750
},
{
"epoch": 2.044321329639889,
"grad_norm": 2.5390875339508057,
"learning_rate": 1.284023943204492e-05,
"loss": 0.1804,
"step": 14760
},
{
"epoch": 2.0457063711911356,
"grad_norm": 2.016263723373413,
"learning_rate": 1.2821678808407965e-05,
"loss": 0.1856,
"step": 14770
},
{
"epoch": 2.0470914127423825,
"grad_norm": 2.1532645225524902,
"learning_rate": 1.2803118184771009e-05,
"loss": 0.1719,
"step": 14780
},
{
"epoch": 2.048476454293629,
"grad_norm": 3.4412078857421875,
"learning_rate": 1.2784557561134054e-05,
"loss": 0.1574,
"step": 14790
},
{
"epoch": 2.0498614958448753,
"grad_norm": 1.2067993879318237,
"learning_rate": 1.2765996937497101e-05,
"loss": 0.1661,
"step": 14800
},
{
"epoch": 2.0512465373961217,
"grad_norm": 1.7038114070892334,
"learning_rate": 1.2747436313860147e-05,
"loss": 0.175,
"step": 14810
},
{
"epoch": 2.0526315789473686,
"grad_norm": 1.9622423648834229,
"learning_rate": 1.2728875690223192e-05,
"loss": 0.192,
"step": 14820
},
{
"epoch": 2.054016620498615,
"grad_norm": 1.806934118270874,
"learning_rate": 1.2710315066586238e-05,
"loss": 0.1814,
"step": 14830
},
{
"epoch": 2.0554016620498614,
"grad_norm": 2.2102761268615723,
"learning_rate": 1.2691754442949285e-05,
"loss": 0.2402,
"step": 14840
},
{
"epoch": 2.056786703601108,
"grad_norm": 1.8251749277114868,
"learning_rate": 1.267319381931233e-05,
"loss": 0.1764,
"step": 14850
},
{
"epoch": 2.0581717451523547,
"grad_norm": 1.8050317764282227,
"learning_rate": 1.2654633195675376e-05,
"loss": 0.1671,
"step": 14860
},
{
"epoch": 2.059556786703601,
"grad_norm": 1.6401031017303467,
"learning_rate": 1.2636072572038423e-05,
"loss": 0.1983,
"step": 14870
},
{
"epoch": 2.0609418282548475,
"grad_norm": 2.121783494949341,
"learning_rate": 1.2617511948401467e-05,
"loss": 0.1669,
"step": 14880
},
{
"epoch": 2.0623268698060944,
"grad_norm": 1.7833797931671143,
"learning_rate": 1.2598951324764512e-05,
"loss": 0.1812,
"step": 14890
},
{
"epoch": 2.063711911357341,
"grad_norm": 1.8813719749450684,
"learning_rate": 1.2580390701127558e-05,
"loss": 0.1865,
"step": 14900
},
{
"epoch": 2.065096952908587,
"grad_norm": 1.707542061805725,
"learning_rate": 1.2561830077490605e-05,
"loss": 0.2263,
"step": 14910
},
{
"epoch": 2.0664819944598336,
"grad_norm": 2.113223075866699,
"learning_rate": 1.254326945385365e-05,
"loss": 0.2098,
"step": 14920
},
{
"epoch": 2.0678670360110805,
"grad_norm": 1.8287922143936157,
"learning_rate": 1.2524708830216696e-05,
"loss": 0.1732,
"step": 14930
},
{
"epoch": 2.069252077562327,
"grad_norm": 1.389681339263916,
"learning_rate": 1.2506148206579742e-05,
"loss": 0.1981,
"step": 14940
},
{
"epoch": 2.0706371191135733,
"grad_norm": 2.0864617824554443,
"learning_rate": 1.2487587582942789e-05,
"loss": 0.1812,
"step": 14950
},
{
"epoch": 2.07202216066482,
"grad_norm": 1.4142706394195557,
"learning_rate": 1.2469026959305834e-05,
"loss": 0.1618,
"step": 14960
},
{
"epoch": 2.0734072022160666,
"grad_norm": 2.4035658836364746,
"learning_rate": 1.245046633566888e-05,
"loss": 0.2163,
"step": 14970
},
{
"epoch": 2.074792243767313,
"grad_norm": 3.36538028717041,
"learning_rate": 1.2431905712031924e-05,
"loss": 0.1659,
"step": 14980
},
{
"epoch": 2.0761772853185594,
"grad_norm": 4.023022651672363,
"learning_rate": 1.2413345088394971e-05,
"loss": 0.1942,
"step": 14990
},
{
"epoch": 2.0775623268698062,
"grad_norm": 3.7977612018585205,
"learning_rate": 1.2394784464758016e-05,
"loss": 0.1708,
"step": 15000
},
{
"epoch": 2.0782548476454292,
"eval_loss": 0.2703794538974762,
"eval_runtime": 1428.8894,
"eval_samples_per_second": 6.383,
"eval_steps_per_second": 0.798,
"step": 15005
},
{
"epoch": 2.0789473684210527,
"grad_norm": 1.9419952630996704,
"learning_rate": 1.2376223841121062e-05,
"loss": 0.1724,
"step": 15010
},
{
"epoch": 2.080332409972299,
"grad_norm": 1.5353316068649292,
"learning_rate": 1.2357663217484107e-05,
"loss": 0.1838,
"step": 15020
},
{
"epoch": 2.081717451523546,
"grad_norm": 1.7038683891296387,
"learning_rate": 1.2339102593847155e-05,
"loss": 0.1659,
"step": 15030
},
{
"epoch": 2.0831024930747923,
"grad_norm": 1.4729688167572021,
"learning_rate": 1.23205419702102e-05,
"loss": 0.1753,
"step": 15040
},
{
"epoch": 2.0844875346260388,
"grad_norm": 1.7278324365615845,
"learning_rate": 1.2301981346573246e-05,
"loss": 0.1754,
"step": 15050
},
{
"epoch": 2.085872576177285,
"grad_norm": 1.8487584590911865,
"learning_rate": 1.2283420722936293e-05,
"loss": 0.1642,
"step": 15060
},
{
"epoch": 2.087257617728532,
"grad_norm": 5.234111785888672,
"learning_rate": 1.2264860099299338e-05,
"loss": 0.2079,
"step": 15070
},
{
"epoch": 2.0886426592797784,
"grad_norm": 2.1265268325805664,
"learning_rate": 1.2246299475662384e-05,
"loss": 0.1834,
"step": 15080
},
{
"epoch": 2.090027700831025,
"grad_norm": 2.6006407737731934,
"learning_rate": 1.2227738852025428e-05,
"loss": 0.1703,
"step": 15090
},
{
"epoch": 2.0914127423822713,
"grad_norm": 1.6003869771957397,
"learning_rate": 1.2209178228388475e-05,
"loss": 0.1881,
"step": 15100
},
{
"epoch": 2.092797783933518,
"grad_norm": 1.5022588968276978,
"learning_rate": 1.219061760475152e-05,
"loss": 0.1654,
"step": 15110
},
{
"epoch": 2.0941828254847645,
"grad_norm": 1.9021573066711426,
"learning_rate": 1.2172056981114566e-05,
"loss": 0.1449,
"step": 15120
},
{
"epoch": 2.095567867036011,
"grad_norm": 2.192850112915039,
"learning_rate": 1.2153496357477611e-05,
"loss": 0.148,
"step": 15130
},
{
"epoch": 2.096952908587258,
"grad_norm": 2.480792284011841,
"learning_rate": 1.2134935733840659e-05,
"loss": 0.2227,
"step": 15140
},
{
"epoch": 2.098337950138504,
"grad_norm": 1.867102026939392,
"learning_rate": 1.2116375110203704e-05,
"loss": 0.2002,
"step": 15150
},
{
"epoch": 2.0997229916897506,
"grad_norm": 2.851119041442871,
"learning_rate": 1.209781448656675e-05,
"loss": 0.1931,
"step": 15160
},
{
"epoch": 2.101108033240997,
"grad_norm": 2.097811698913574,
"learning_rate": 1.2079253862929797e-05,
"loss": 0.1952,
"step": 15170
},
{
"epoch": 2.102493074792244,
"grad_norm": 1.8236339092254639,
"learning_rate": 1.2060693239292842e-05,
"loss": 0.1726,
"step": 15180
},
{
"epoch": 2.1038781163434903,
"grad_norm": 2.647979259490967,
"learning_rate": 1.2042132615655886e-05,
"loss": 0.18,
"step": 15190
},
{
"epoch": 2.1052631578947367,
"grad_norm": 2.0955395698547363,
"learning_rate": 1.2023571992018932e-05,
"loss": 0.2037,
"step": 15200
},
{
"epoch": 2.1066481994459836,
"grad_norm": 2.227377414703369,
"learning_rate": 1.2005011368381979e-05,
"loss": 0.2027,
"step": 15210
},
{
"epoch": 2.10803324099723,
"grad_norm": 1.8922470808029175,
"learning_rate": 1.1986450744745024e-05,
"loss": 0.2151,
"step": 15220
},
{
"epoch": 2.1094182825484764,
"grad_norm": 1.543190360069275,
"learning_rate": 1.196789012110807e-05,
"loss": 0.1816,
"step": 15230
},
{
"epoch": 2.110803324099723,
"grad_norm": 1.440812110900879,
"learning_rate": 1.1949329497471115e-05,
"loss": 0.1457,
"step": 15240
},
{
"epoch": 2.1121883656509697,
"grad_norm": 1.3764451742172241,
"learning_rate": 1.1930768873834163e-05,
"loss": 0.2024,
"step": 15250
},
{
"epoch": 2.113573407202216,
"grad_norm": 1.9469165802001953,
"learning_rate": 1.1912208250197208e-05,
"loss": 0.1938,
"step": 15260
},
{
"epoch": 2.1149584487534625,
"grad_norm": 2.06449818611145,
"learning_rate": 1.1893647626560254e-05,
"loss": 0.1746,
"step": 15270
},
{
"epoch": 2.1163434903047094,
"grad_norm": 2.1765999794006348,
"learning_rate": 1.18750870029233e-05,
"loss": 0.1746,
"step": 15280
},
{
"epoch": 2.1177285318559558,
"grad_norm": 1.6806100606918335,
"learning_rate": 1.1856526379286346e-05,
"loss": 0.146,
"step": 15290
},
{
"epoch": 2.119113573407202,
"grad_norm": 1.8645600080490112,
"learning_rate": 1.183796575564939e-05,
"loss": 0.1805,
"step": 15300
},
{
"epoch": 2.1204986149584486,
"grad_norm": 1.9853228330612183,
"learning_rate": 1.1819405132012436e-05,
"loss": 0.1881,
"step": 15310
},
{
"epoch": 2.1218836565096955,
"grad_norm": 2.438157796859741,
"learning_rate": 1.1800844508375481e-05,
"loss": 0.1605,
"step": 15320
},
{
"epoch": 2.123268698060942,
"grad_norm": 1.974244475364685,
"learning_rate": 1.1782283884738528e-05,
"loss": 0.2067,
"step": 15330
},
{
"epoch": 2.1246537396121883,
"grad_norm": 2.455505847930908,
"learning_rate": 1.1763723261101574e-05,
"loss": 0.2004,
"step": 15340
},
{
"epoch": 2.1260387811634347,
"grad_norm": 1.9168258905410767,
"learning_rate": 1.174516263746462e-05,
"loss": 0.2129,
"step": 15350
},
{
"epoch": 2.1274238227146816,
"grad_norm": 1.6309889554977417,
"learning_rate": 1.1726602013827666e-05,
"loss": 0.1975,
"step": 15360
},
{
"epoch": 2.128808864265928,
"grad_norm": 3.4665346145629883,
"learning_rate": 1.1708041390190712e-05,
"loss": 0.2004,
"step": 15370
},
{
"epoch": 2.1301939058171744,
"grad_norm": 1.8809685707092285,
"learning_rate": 1.1689480766553757e-05,
"loss": 0.1998,
"step": 15380
},
{
"epoch": 2.1315789473684212,
"grad_norm": 2.7498867511749268,
"learning_rate": 1.1670920142916805e-05,
"loss": 0.1567,
"step": 15390
},
{
"epoch": 2.1329639889196677,
"grad_norm": 2.1605582237243652,
"learning_rate": 1.1652359519279848e-05,
"loss": 0.2108,
"step": 15400
},
{
"epoch": 2.134349030470914,
"grad_norm": 1.3878306150436401,
"learning_rate": 1.1633798895642894e-05,
"loss": 0.1653,
"step": 15410
},
{
"epoch": 2.1357340720221605,
"grad_norm": 1.9377297163009644,
"learning_rate": 1.161523827200594e-05,
"loss": 0.1893,
"step": 15420
},
{
"epoch": 2.1371191135734073,
"grad_norm": 1.818156361579895,
"learning_rate": 1.1596677648368985e-05,
"loss": 0.1557,
"step": 15430
},
{
"epoch": 2.1385041551246537,
"grad_norm": 2.047635555267334,
"learning_rate": 1.1578117024732032e-05,
"loss": 0.2056,
"step": 15440
},
{
"epoch": 2.1398891966759,
"grad_norm": 1.8148167133331299,
"learning_rate": 1.1559556401095078e-05,
"loss": 0.204,
"step": 15450
},
{
"epoch": 2.141274238227147,
"grad_norm": 1.9170321226119995,
"learning_rate": 1.1540995777458123e-05,
"loss": 0.1645,
"step": 15460
},
{
"epoch": 2.1426592797783934,
"grad_norm": 2.07987117767334,
"learning_rate": 1.152243515382117e-05,
"loss": 0.1828,
"step": 15470
},
{
"epoch": 2.14404432132964,
"grad_norm": 1.5561580657958984,
"learning_rate": 1.1503874530184216e-05,
"loss": 0.1782,
"step": 15480
},
{
"epoch": 2.1454293628808863,
"grad_norm": 1.8059701919555664,
"learning_rate": 1.1485313906547261e-05,
"loss": 0.209,
"step": 15490
},
{
"epoch": 2.146814404432133,
"grad_norm": 1.7873286008834839,
"learning_rate": 1.1466753282910305e-05,
"loss": 0.1892,
"step": 15500
},
{
"epoch": 2.1481994459833795,
"grad_norm": 2.170375347137451,
"learning_rate": 1.1448192659273352e-05,
"loss": 0.1714,
"step": 15510
},
{
"epoch": 2.149584487534626,
"grad_norm": 1.824212908744812,
"learning_rate": 1.1429632035636398e-05,
"loss": 0.1742,
"step": 15520
},
{
"epoch": 2.150969529085873,
"grad_norm": 3.1203479766845703,
"learning_rate": 1.1411071411999443e-05,
"loss": 0.2095,
"step": 15530
},
{
"epoch": 2.152354570637119,
"grad_norm": 2.1490399837493896,
"learning_rate": 1.1392510788362489e-05,
"loss": 0.1543,
"step": 15540
},
{
"epoch": 2.1537396121883656,
"grad_norm": 2.8778560161590576,
"learning_rate": 1.1373950164725536e-05,
"loss": 0.19,
"step": 15550
},
{
"epoch": 2.155124653739612,
"grad_norm": 2.4876325130462646,
"learning_rate": 1.1355389541088582e-05,
"loss": 0.2228,
"step": 15560
},
{
"epoch": 2.156509695290859,
"grad_norm": 1.8188594579696655,
"learning_rate": 1.1336828917451627e-05,
"loss": 0.1879,
"step": 15570
},
{
"epoch": 2.1578947368421053,
"grad_norm": 2.6546733379364014,
"learning_rate": 1.1318268293814674e-05,
"loss": 0.1667,
"step": 15580
},
{
"epoch": 2.1592797783933517,
"grad_norm": 1.4731812477111816,
"learning_rate": 1.129970767017772e-05,
"loss": 0.1701,
"step": 15590
},
{
"epoch": 2.160664819944598,
"grad_norm": 1.7102782726287842,
"learning_rate": 1.1281147046540765e-05,
"loss": 0.1939,
"step": 15600
},
{
"epoch": 2.162049861495845,
"grad_norm": 1.9763939380645752,
"learning_rate": 1.126258642290381e-05,
"loss": 0.1957,
"step": 15610
},
{
"epoch": 2.1634349030470914,
"grad_norm": 2.430368661880493,
"learning_rate": 1.1244025799266856e-05,
"loss": 0.2212,
"step": 15620
},
{
"epoch": 2.164819944598338,
"grad_norm": 2.098733425140381,
"learning_rate": 1.1225465175629902e-05,
"loss": 0.1502,
"step": 15630
},
{
"epoch": 2.1662049861495847,
"grad_norm": 1.9996839761734009,
"learning_rate": 1.1206904551992947e-05,
"loss": 0.2054,
"step": 15640
},
{
"epoch": 2.167590027700831,
"grad_norm": 2.2129204273223877,
"learning_rate": 1.1188343928355993e-05,
"loss": 0.1923,
"step": 15650
},
{
"epoch": 2.1689750692520775,
"grad_norm": 2.9811301231384277,
"learning_rate": 1.116978330471904e-05,
"loss": 0.1995,
"step": 15660
},
{
"epoch": 2.170360110803324,
"grad_norm": 1.7352439165115356,
"learning_rate": 1.1151222681082086e-05,
"loss": 0.1815,
"step": 15670
},
{
"epoch": 2.1717451523545708,
"grad_norm": 2.3152153491973877,
"learning_rate": 1.1132662057445131e-05,
"loss": 0.2229,
"step": 15680
},
{
"epoch": 2.173130193905817,
"grad_norm": 2.018587350845337,
"learning_rate": 1.1114101433808178e-05,
"loss": 0.1789,
"step": 15690
},
{
"epoch": 2.1745152354570636,
"grad_norm": 2.000908613204956,
"learning_rate": 1.1095540810171224e-05,
"loss": 0.1999,
"step": 15700
},
{
"epoch": 2.1759002770083105,
"grad_norm": 2.4677839279174805,
"learning_rate": 1.1076980186534268e-05,
"loss": 0.1765,
"step": 15710
},
{
"epoch": 2.177285318559557,
"grad_norm": 1.403136134147644,
"learning_rate": 1.1058419562897313e-05,
"loss": 0.1749,
"step": 15720
},
{
"epoch": 2.1786703601108033,
"grad_norm": 1.935598611831665,
"learning_rate": 1.1039858939260359e-05,
"loss": 0.1506,
"step": 15730
},
{
"epoch": 2.1800554016620497,
"grad_norm": 2.197678565979004,
"learning_rate": 1.1021298315623406e-05,
"loss": 0.1765,
"step": 15740
},
{
"epoch": 2.1814404432132966,
"grad_norm": 1.7003947496414185,
"learning_rate": 1.1002737691986451e-05,
"loss": 0.1747,
"step": 15750
},
{
"epoch": 2.182825484764543,
"grad_norm": 1.7991876602172852,
"learning_rate": 1.0984177068349497e-05,
"loss": 0.178,
"step": 15760
},
{
"epoch": 2.1842105263157894,
"grad_norm": 3.0078437328338623,
"learning_rate": 1.0965616444712544e-05,
"loss": 0.164,
"step": 15770
},
{
"epoch": 2.1855955678670362,
"grad_norm": 1.8674951791763306,
"learning_rate": 1.094705582107559e-05,
"loss": 0.2086,
"step": 15780
},
{
"epoch": 2.1869806094182827,
"grad_norm": 1.6032930612564087,
"learning_rate": 1.0928495197438635e-05,
"loss": 0.151,
"step": 15790
},
{
"epoch": 2.188365650969529,
"grad_norm": 2.2988972663879395,
"learning_rate": 1.0909934573801682e-05,
"loss": 0.1991,
"step": 15800
},
{
"epoch": 2.1897506925207755,
"grad_norm": 2.0465915203094482,
"learning_rate": 1.0891373950164726e-05,
"loss": 0.1855,
"step": 15810
},
{
"epoch": 2.1911357340720223,
"grad_norm": 1.6645904779434204,
"learning_rate": 1.0874669388891467e-05,
"loss": 0.1907,
"step": 15820
},
{
"epoch": 2.1925207756232687,
"grad_norm": 1.4574722051620483,
"learning_rate": 1.0856108765254514e-05,
"loss": 0.1808,
"step": 15830
},
{
"epoch": 2.193905817174515,
"grad_norm": 3.1846442222595215,
"learning_rate": 1.083754814161756e-05,
"loss": 0.2199,
"step": 15840
},
{
"epoch": 2.1952908587257616,
"grad_norm": 2.510326385498047,
"learning_rate": 1.0818987517980605e-05,
"loss": 0.1804,
"step": 15850
},
{
"epoch": 2.1966759002770084,
"grad_norm": 1.9207181930541992,
"learning_rate": 1.0800426894343653e-05,
"loss": 0.1802,
"step": 15860
},
{
"epoch": 2.198060941828255,
"grad_norm": 1.5893478393554688,
"learning_rate": 1.0781866270706696e-05,
"loss": 0.1877,
"step": 15870
},
{
"epoch": 2.1994459833795013,
"grad_norm": 2.8297579288482666,
"learning_rate": 1.0763305647069742e-05,
"loss": 0.1832,
"step": 15880
},
{
"epoch": 2.200831024930748,
"grad_norm": 2.0689809322357178,
"learning_rate": 1.0744745023432787e-05,
"loss": 0.1696,
"step": 15890
},
{
"epoch": 2.2022160664819945,
"grad_norm": 2.195650577545166,
"learning_rate": 1.0726184399795833e-05,
"loss": 0.1962,
"step": 15900
},
{
"epoch": 2.203601108033241,
"grad_norm": 2.4047739505767822,
"learning_rate": 1.070762377615888e-05,
"loss": 0.1616,
"step": 15910
},
{
"epoch": 2.2049861495844874,
"grad_norm": 2.298598289489746,
"learning_rate": 1.0689063152521926e-05,
"loss": 0.1956,
"step": 15920
},
{
"epoch": 2.206371191135734,
"grad_norm": 2.514930009841919,
"learning_rate": 1.0670502528884971e-05,
"loss": 0.192,
"step": 15930
},
{
"epoch": 2.2077562326869806,
"grad_norm": 2.1210293769836426,
"learning_rate": 1.0651941905248018e-05,
"loss": 0.1501,
"step": 15940
},
{
"epoch": 2.209141274238227,
"grad_norm": 1.428801417350769,
"learning_rate": 1.0633381281611064e-05,
"loss": 0.1822,
"step": 15950
},
{
"epoch": 2.2105263157894735,
"grad_norm": 1.8047534227371216,
"learning_rate": 1.061482065797411e-05,
"loss": 0.1616,
"step": 15960
},
{
"epoch": 2.2119113573407203,
"grad_norm": 1.586500644683838,
"learning_rate": 1.0596260034337153e-05,
"loss": 0.1567,
"step": 15970
},
{
"epoch": 2.2132963988919667,
"grad_norm": 1.429770588874817,
"learning_rate": 1.05776994107002e-05,
"loss": 0.1873,
"step": 15980
},
{
"epoch": 2.214681440443213,
"grad_norm": 3.8475210666656494,
"learning_rate": 1.0559138787063246e-05,
"loss": 0.1759,
"step": 15990
},
{
"epoch": 2.21606648199446,
"grad_norm": 2.3511464595794678,
"learning_rate": 1.0540578163426291e-05,
"loss": 0.1683,
"step": 16000
},
{
"epoch": 2.2174515235457064,
"grad_norm": 2.5054216384887695,
"learning_rate": 1.0522017539789337e-05,
"loss": 0.1761,
"step": 16010
},
{
"epoch": 2.218836565096953,
"grad_norm": 1.5130455493927002,
"learning_rate": 1.0503456916152384e-05,
"loss": 0.1769,
"step": 16020
},
{
"epoch": 2.2202216066481997,
"grad_norm": 2.0365564823150635,
"learning_rate": 1.048489629251543e-05,
"loss": 0.1929,
"step": 16030
},
{
"epoch": 2.221606648199446,
"grad_norm": 2.6106247901916504,
"learning_rate": 1.0466335668878475e-05,
"loss": 0.1926,
"step": 16040
},
{
"epoch": 2.2229916897506925,
"grad_norm": 1.5493723154067993,
"learning_rate": 1.0447775045241522e-05,
"loss": 0.177,
"step": 16050
},
{
"epoch": 2.224376731301939,
"grad_norm": 2.3234448432922363,
"learning_rate": 1.0429214421604568e-05,
"loss": 0.2127,
"step": 16060
},
{
"epoch": 2.2257617728531858,
"grad_norm": 2.54538893699646,
"learning_rate": 1.0410653797967613e-05,
"loss": 0.1878,
"step": 16070
},
{
"epoch": 2.227146814404432,
"grad_norm": 1.641268014907837,
"learning_rate": 1.0392093174330657e-05,
"loss": 0.1522,
"step": 16080
},
{
"epoch": 2.2285318559556786,
"grad_norm": 2.013471841812134,
"learning_rate": 1.0373532550693704e-05,
"loss": 0.1901,
"step": 16090
},
{
"epoch": 2.229916897506925,
"grad_norm": 2.284034490585327,
"learning_rate": 1.035497192705675e-05,
"loss": 0.1998,
"step": 16100
},
{
"epoch": 2.231301939058172,
"grad_norm": 1.8952127695083618,
"learning_rate": 1.0336411303419795e-05,
"loss": 0.1582,
"step": 16110
},
{
"epoch": 2.2326869806094183,
"grad_norm": 2.218730926513672,
"learning_rate": 1.031785067978284e-05,
"loss": 0.1871,
"step": 16120
},
{
"epoch": 2.2340720221606647,
"grad_norm": 1.4592667818069458,
"learning_rate": 1.0299290056145888e-05,
"loss": 0.1613,
"step": 16130
},
{
"epoch": 2.2354570637119116,
"grad_norm": 2.5558269023895264,
"learning_rate": 1.0280729432508934e-05,
"loss": 0.1907,
"step": 16140
},
{
"epoch": 2.236842105263158,
"grad_norm": 2.4344468116760254,
"learning_rate": 1.0262168808871979e-05,
"loss": 0.1599,
"step": 16150
},
{
"epoch": 2.2382271468144044,
"grad_norm": 1.4910821914672852,
"learning_rate": 1.0243608185235026e-05,
"loss": 0.1576,
"step": 16160
},
{
"epoch": 2.239612188365651,
"grad_norm": 2.0156784057617188,
"learning_rate": 1.0225047561598072e-05,
"loss": 0.191,
"step": 16170
},
{
"epoch": 2.2409972299168976,
"grad_norm": 3.575056314468384,
"learning_rate": 1.0206486937961116e-05,
"loss": 0.2032,
"step": 16180
},
{
"epoch": 2.242382271468144,
"grad_norm": 2.0502638816833496,
"learning_rate": 1.0187926314324161e-05,
"loss": 0.1915,
"step": 16190
},
{
"epoch": 2.2437673130193905,
"grad_norm": 1.5162267684936523,
"learning_rate": 1.0169365690687207e-05,
"loss": 0.1873,
"step": 16200
},
{
"epoch": 2.245152354570637,
"grad_norm": 2.2193379402160645,
"learning_rate": 1.0150805067050254e-05,
"loss": 0.1924,
"step": 16210
},
{
"epoch": 2.2465373961218837,
"grad_norm": 2.0222744941711426,
"learning_rate": 1.01322444434133e-05,
"loss": 0.1615,
"step": 16220
},
{
"epoch": 2.24792243767313,
"grad_norm": 1.7618517875671387,
"learning_rate": 1.0113683819776345e-05,
"loss": 0.1622,
"step": 16230
},
{
"epoch": 2.2493074792243766,
"grad_norm": 1.4167286157608032,
"learning_rate": 1.0095123196139392e-05,
"loss": 0.1759,
"step": 16240
},
{
"epoch": 2.2506925207756234,
"grad_norm": 1.6987353563308716,
"learning_rate": 1.0076562572502437e-05,
"loss": 0.1722,
"step": 16250
},
{
"epoch": 2.25207756232687,
"grad_norm": 1.8066134452819824,
"learning_rate": 1.0058001948865483e-05,
"loss": 0.1547,
"step": 16260
},
{
"epoch": 2.2534626038781163,
"grad_norm": 1.7205597162246704,
"learning_rate": 1.003944132522853e-05,
"loss": 0.1733,
"step": 16270
},
{
"epoch": 2.254847645429363,
"grad_norm": 1.9057214260101318,
"learning_rate": 1.0020880701591574e-05,
"loss": 0.1933,
"step": 16280
},
{
"epoch": 2.2562326869806095,
"grad_norm": 1.8358714580535889,
"learning_rate": 1.000232007795462e-05,
"loss": 0.1775,
"step": 16290
},
{
"epoch": 2.257617728531856,
"grad_norm": 3.4662563800811768,
"learning_rate": 9.983759454317667e-06,
"loss": 0.1976,
"step": 16300
},
{
"epoch": 2.2590027700831024,
"grad_norm": 2.5907483100891113,
"learning_rate": 9.96519883068071e-06,
"loss": 0.1674,
"step": 16310
},
{
"epoch": 2.260387811634349,
"grad_norm": 2.281738042831421,
"learning_rate": 9.946638207043758e-06,
"loss": 0.1941,
"step": 16320
},
{
"epoch": 2.2617728531855956,
"grad_norm": 2.349745035171509,
"learning_rate": 9.928077583406803e-06,
"loss": 0.1961,
"step": 16330
},
{
"epoch": 2.263157894736842,
"grad_norm": 1.396642804145813,
"learning_rate": 9.909516959769849e-06,
"loss": 0.1475,
"step": 16340
},
{
"epoch": 2.2645429362880884,
"grad_norm": 1.8703711032867432,
"learning_rate": 9.892812398496592e-06,
"loss": 0.1619,
"step": 16350
},
{
"epoch": 2.2659279778393353,
"grad_norm": 2.279014825820923,
"learning_rate": 9.874251774859635e-06,
"loss": 0.1788,
"step": 16360
},
{
"epoch": 2.2673130193905817,
"grad_norm": 1.6533187627792358,
"learning_rate": 9.855691151222681e-06,
"loss": 0.1947,
"step": 16370
},
{
"epoch": 2.268698060941828,
"grad_norm": 1.9308675527572632,
"learning_rate": 9.837130527585728e-06,
"loss": 0.1818,
"step": 16380
},
{
"epoch": 2.270083102493075,
"grad_norm": 2.1606502532958984,
"learning_rate": 9.818569903948774e-06,
"loss": 0.1866,
"step": 16390
},
{
"epoch": 2.2714681440443214,
"grad_norm": 2.4330480098724365,
"learning_rate": 9.800009280311819e-06,
"loss": 0.1811,
"step": 16400
},
{
"epoch": 2.272853185595568,
"grad_norm": 2.237210512161255,
"learning_rate": 9.781448656674865e-06,
"loss": 0.16,
"step": 16410
},
{
"epoch": 2.2742382271468142,
"grad_norm": 5.955811977386475,
"learning_rate": 9.76288803303791e-06,
"loss": 0.1702,
"step": 16420
},
{
"epoch": 2.275623268698061,
"grad_norm": 1.7061985731124878,
"learning_rate": 9.744327409400957e-06,
"loss": 0.1758,
"step": 16430
},
{
"epoch": 2.2770083102493075,
"grad_norm": 3.15761399269104,
"learning_rate": 9.725766785764003e-06,
"loss": 0.1877,
"step": 16440
},
{
"epoch": 2.278393351800554,
"grad_norm": 1.7660126686096191,
"learning_rate": 9.707206162127048e-06,
"loss": 0.1702,
"step": 16450
},
{
"epoch": 2.2797783933518003,
"grad_norm": 2.0085220336914062,
"learning_rate": 9.688645538490094e-06,
"loss": 0.176,
"step": 16460
},
{
"epoch": 2.281163434903047,
"grad_norm": 2.159771203994751,
"learning_rate": 9.67008491485314e-06,
"loss": 0.1999,
"step": 16470
},
{
"epoch": 2.2825484764542936,
"grad_norm": 2.3023860454559326,
"learning_rate": 9.651524291216185e-06,
"loss": 0.1575,
"step": 16480
},
{
"epoch": 2.28393351800554,
"grad_norm": 1.5170584917068481,
"learning_rate": 9.632963667579232e-06,
"loss": 0.1908,
"step": 16490
},
{
"epoch": 2.285318559556787,
"grad_norm": 2.998106002807617,
"learning_rate": 9.614403043942278e-06,
"loss": 0.1824,
"step": 16500
},
{
"epoch": 2.2867036011080333,
"grad_norm": 2.6079273223876953,
"learning_rate": 9.595842420305323e-06,
"loss": 0.1978,
"step": 16510
},
{
"epoch": 2.2880886426592797,
"grad_norm": 2.0469090938568115,
"learning_rate": 9.577281796668369e-06,
"loss": 0.181,
"step": 16520
},
{
"epoch": 2.2894736842105265,
"grad_norm": 1.8900119066238403,
"learning_rate": 9.558721173031414e-06,
"loss": 0.1852,
"step": 16530
},
{
"epoch": 2.290858725761773,
"grad_norm": 1.3825252056121826,
"learning_rate": 9.540160549394461e-06,
"loss": 0.2009,
"step": 16540
},
{
"epoch": 2.2922437673130194,
"grad_norm": 2.3342790603637695,
"learning_rate": 9.521599925757507e-06,
"loss": 0.208,
"step": 16550
},
{
"epoch": 2.293628808864266,
"grad_norm": 3.8766579627990723,
"learning_rate": 9.503039302120552e-06,
"loss": 0.1838,
"step": 16560
},
{
"epoch": 2.2950138504155126,
"grad_norm": 2.5892879962921143,
"learning_rate": 9.484478678483598e-06,
"loss": 0.1826,
"step": 16570
},
{
"epoch": 2.296398891966759,
"grad_norm": 1.9301683902740479,
"learning_rate": 9.465918054846643e-06,
"loss": 0.1768,
"step": 16580
},
{
"epoch": 2.2977839335180055,
"grad_norm": 1.7350757122039795,
"learning_rate": 9.447357431209689e-06,
"loss": 0.1769,
"step": 16590
},
{
"epoch": 2.299168975069252,
"grad_norm": 2.5931735038757324,
"learning_rate": 9.428796807572736e-06,
"loss": 0.1934,
"step": 16600
},
{
"epoch": 2.3005540166204987,
"grad_norm": 1.9637055397033691,
"learning_rate": 9.410236183935781e-06,
"loss": 0.1603,
"step": 16610
},
{
"epoch": 2.301939058171745,
"grad_norm": 2.185250997543335,
"learning_rate": 9.391675560298827e-06,
"loss": 0.1741,
"step": 16620
},
{
"epoch": 2.3033240997229916,
"grad_norm": 2.1665894985198975,
"learning_rate": 9.373114936661872e-06,
"loss": 0.1598,
"step": 16630
},
{
"epoch": 2.3047091412742384,
"grad_norm": 2.147671937942505,
"learning_rate": 9.354554313024918e-06,
"loss": 0.1916,
"step": 16640
},
{
"epoch": 2.306094182825485,
"grad_norm": 2.141869068145752,
"learning_rate": 9.335993689387965e-06,
"loss": 0.16,
"step": 16650
},
{
"epoch": 2.3074792243767313,
"grad_norm": 2.271667957305908,
"learning_rate": 9.31743306575101e-06,
"loss": 0.1961,
"step": 16660
},
{
"epoch": 2.3088642659279777,
"grad_norm": 2.0384247303009033,
"learning_rate": 9.298872442114054e-06,
"loss": 0.1948,
"step": 16670
},
{
"epoch": 2.3102493074792245,
"grad_norm": 1.5816787481307983,
"learning_rate": 9.280311818477102e-06,
"loss": 0.1827,
"step": 16680
},
{
"epoch": 2.311634349030471,
"grad_norm": 1.7221791744232178,
"learning_rate": 9.261751194840147e-06,
"loss": 0.1798,
"step": 16690
},
{
"epoch": 2.3130193905817173,
"grad_norm": 1.7499399185180664,
"learning_rate": 9.243190571203193e-06,
"loss": 0.1946,
"step": 16700
},
{
"epoch": 2.3144044321329638,
"grad_norm": 2.3477981090545654,
"learning_rate": 9.22462994756624e-06,
"loss": 0.1628,
"step": 16710
},
{
"epoch": 2.3157894736842106,
"grad_norm": 2.5354487895965576,
"learning_rate": 9.206069323929285e-06,
"loss": 0.1748,
"step": 16720
},
{
"epoch": 2.317174515235457,
"grad_norm": 2.410682201385498,
"learning_rate": 9.187508700292331e-06,
"loss": 0.1858,
"step": 16730
},
{
"epoch": 2.3185595567867034,
"grad_norm": 2.190929651260376,
"learning_rate": 9.168948076655376e-06,
"loss": 0.1744,
"step": 16740
},
{
"epoch": 2.3199445983379503,
"grad_norm": 2.5295703411102295,
"learning_rate": 9.150387453018422e-06,
"loss": 0.1855,
"step": 16750
},
{
"epoch": 2.3213296398891967,
"grad_norm": 1.313277244567871,
"learning_rate": 9.133682891745163e-06,
"loss": 0.1862,
"step": 16760
},
{
"epoch": 2.322714681440443,
"grad_norm": 2.6606435775756836,
"learning_rate": 9.11512226810821e-06,
"loss": 0.1474,
"step": 16770
},
{
"epoch": 2.32409972299169,
"grad_norm": 2.0530002117156982,
"learning_rate": 9.096561644471254e-06,
"loss": 0.1808,
"step": 16780
},
{
"epoch": 2.3254847645429364,
"grad_norm": 1.8997056484222412,
"learning_rate": 9.078001020834301e-06,
"loss": 0.1928,
"step": 16790
},
{
"epoch": 2.326869806094183,
"grad_norm": 2.367507219314575,
"learning_rate": 9.059440397197347e-06,
"loss": 0.1783,
"step": 16800
},
{
"epoch": 2.3282548476454292,
"grad_norm": 1.7851054668426514,
"learning_rate": 9.040879773560392e-06,
"loss": 0.1978,
"step": 16810
},
{
"epoch": 2.329639889196676,
"grad_norm": 1.6374269723892212,
"learning_rate": 9.022319149923438e-06,
"loss": 0.1696,
"step": 16820
},
{
"epoch": 2.3310249307479225,
"grad_norm": 1.65655517578125,
"learning_rate": 9.003758526286483e-06,
"loss": 0.1911,
"step": 16830
},
{
"epoch": 2.332409972299169,
"grad_norm": 1.8260128498077393,
"learning_rate": 8.985197902649529e-06,
"loss": 0.1993,
"step": 16840
},
{
"epoch": 2.3337950138504153,
"grad_norm": 1.7871785163879395,
"learning_rate": 8.966637279012576e-06,
"loss": 0.1881,
"step": 16850
},
{
"epoch": 2.335180055401662,
"grad_norm": 2.481699228286743,
"learning_rate": 8.948076655375621e-06,
"loss": 0.1825,
"step": 16860
},
{
"epoch": 2.3365650969529086,
"grad_norm": 1.8377317190170288,
"learning_rate": 8.929516031738667e-06,
"loss": 0.1687,
"step": 16870
},
{
"epoch": 2.337950138504155,
"grad_norm": 1.7560187578201294,
"learning_rate": 8.910955408101712e-06,
"loss": 0.1762,
"step": 16880
},
{
"epoch": 2.339335180055402,
"grad_norm": 2.3741097450256348,
"learning_rate": 8.892394784464758e-06,
"loss": 0.1885,
"step": 16890
},
{
"epoch": 2.3407202216066483,
"grad_norm": 2.3812923431396484,
"learning_rate": 8.873834160827805e-06,
"loss": 0.203,
"step": 16900
},
{
"epoch": 2.3421052631578947,
"grad_norm": 1.9939693212509155,
"learning_rate": 8.85527353719085e-06,
"loss": 0.2002,
"step": 16910
},
{
"epoch": 2.343490304709141,
"grad_norm": 2.0856332778930664,
"learning_rate": 8.836712913553896e-06,
"loss": 0.2113,
"step": 16920
},
{
"epoch": 2.344875346260388,
"grad_norm": 1.7179794311523438,
"learning_rate": 8.818152289916942e-06,
"loss": 0.1582,
"step": 16930
},
{
"epoch": 2.3462603878116344,
"grad_norm": 2.0003726482391357,
"learning_rate": 8.799591666279987e-06,
"loss": 0.2333,
"step": 16940
},
{
"epoch": 2.347645429362881,
"grad_norm": 1.780743956565857,
"learning_rate": 8.781031042643033e-06,
"loss": 0.1748,
"step": 16950
},
{
"epoch": 2.349030470914127,
"grad_norm": 3.4279558658599854,
"learning_rate": 8.76247041900608e-06,
"loss": 0.1969,
"step": 16960
},
{
"epoch": 2.350415512465374,
"grad_norm": 1.674522042274475,
"learning_rate": 8.743909795369125e-06,
"loss": 0.1952,
"step": 16970
},
{
"epoch": 2.3518005540166205,
"grad_norm": 1.9975336790084839,
"learning_rate": 8.725349171732171e-06,
"loss": 0.1838,
"step": 16980
},
{
"epoch": 2.353185595567867,
"grad_norm": 2.15136456489563,
"learning_rate": 8.706788548095216e-06,
"loss": 0.1835,
"step": 16990
},
{
"epoch": 2.3545706371191137,
"grad_norm": 1.4658695459365845,
"learning_rate": 8.688227924458262e-06,
"loss": 0.1541,
"step": 17000
},
{
"epoch": 2.35595567867036,
"grad_norm": 2.032360792160034,
"learning_rate": 8.669667300821309e-06,
"loss": 0.1694,
"step": 17010
},
{
"epoch": 2.3573407202216066,
"grad_norm": 2.0457422733306885,
"learning_rate": 8.651106677184355e-06,
"loss": 0.1971,
"step": 17020
},
{
"epoch": 2.3587257617728534,
"grad_norm": 1.985001564025879,
"learning_rate": 8.6325460535474e-06,
"loss": 0.1649,
"step": 17030
},
{
"epoch": 2.3601108033241,
"grad_norm": 1.8881748914718628,
"learning_rate": 8.613985429910446e-06,
"loss": 0.1711,
"step": 17040
},
{
"epoch": 2.3614958448753463,
"grad_norm": 2.045001745223999,
"learning_rate": 8.595424806273491e-06,
"loss": 0.1908,
"step": 17050
},
{
"epoch": 2.3628808864265927,
"grad_norm": 2.6451058387756348,
"learning_rate": 8.576864182636537e-06,
"loss": 0.1899,
"step": 17060
},
{
"epoch": 2.3642659279778395,
"grad_norm": 2.57863187789917,
"learning_rate": 8.558303558999584e-06,
"loss": 0.1604,
"step": 17070
},
{
"epoch": 2.365650969529086,
"grad_norm": 4.005101203918457,
"learning_rate": 8.53974293536263e-06,
"loss": 0.1464,
"step": 17080
},
{
"epoch": 2.3670360110803323,
"grad_norm": 2.0819427967071533,
"learning_rate": 8.521182311725675e-06,
"loss": 0.1723,
"step": 17090
},
{
"epoch": 2.3684210526315788,
"grad_norm": 2.7164533138275146,
"learning_rate": 8.50262168808872e-06,
"loss": 0.2051,
"step": 17100
},
{
"epoch": 2.3698060941828256,
"grad_norm": 1.8587703704833984,
"learning_rate": 8.484061064451766e-06,
"loss": 0.1622,
"step": 17110
},
{
"epoch": 2.371191135734072,
"grad_norm": 1.5107576847076416,
"learning_rate": 8.465500440814813e-06,
"loss": 0.176,
"step": 17120
},
{
"epoch": 2.3725761772853184,
"grad_norm": 3.7486839294433594,
"learning_rate": 8.446939817177859e-06,
"loss": 0.1531,
"step": 17130
},
{
"epoch": 2.3739612188365653,
"grad_norm": 2.4600539207458496,
"learning_rate": 8.428379193540902e-06,
"loss": 0.2029,
"step": 17140
},
{
"epoch": 2.3753462603878117,
"grad_norm": 1.6657919883728027,
"learning_rate": 8.40981856990395e-06,
"loss": 0.163,
"step": 17150
},
{
"epoch": 2.376731301939058,
"grad_norm": 1.488139271736145,
"learning_rate": 8.391257946266995e-06,
"loss": 0.1732,
"step": 17160
},
{
"epoch": 2.3781163434903045,
"grad_norm": 1.7949665784835815,
"learning_rate": 8.37269732263004e-06,
"loss": 0.2044,
"step": 17170
},
{
"epoch": 2.3795013850415514,
"grad_norm": 4.909599781036377,
"learning_rate": 8.354136698993088e-06,
"loss": 0.1786,
"step": 17180
},
{
"epoch": 2.380886426592798,
"grad_norm": 1.3685661554336548,
"learning_rate": 8.335576075356132e-06,
"loss": 0.1948,
"step": 17190
},
{
"epoch": 2.3822714681440442,
"grad_norm": 1.8317877054214478,
"learning_rate": 8.317015451719179e-06,
"loss": 0.1766,
"step": 17200
},
{
"epoch": 2.3836565096952906,
"grad_norm": 2.7056987285614014,
"learning_rate": 8.298454828082224e-06,
"loss": 0.166,
"step": 17210
},
{
"epoch": 2.3850415512465375,
"grad_norm": 2.0532264709472656,
"learning_rate": 8.27989420444527e-06,
"loss": 0.1553,
"step": 17220
},
{
"epoch": 2.386426592797784,
"grad_norm": 1.6754969358444214,
"learning_rate": 8.261333580808315e-06,
"loss": 0.1811,
"step": 17230
},
{
"epoch": 2.3878116343490303,
"grad_norm": 1.6388112306594849,
"learning_rate": 8.242772957171363e-06,
"loss": 0.201,
"step": 17240
},
{
"epoch": 2.389196675900277,
"grad_norm": 1.9280608892440796,
"learning_rate": 8.224212333534406e-06,
"loss": 0.1714,
"step": 17250
},
{
"epoch": 2.3905817174515236,
"grad_norm": 1.5879631042480469,
"learning_rate": 8.205651709897454e-06,
"loss": 0.1941,
"step": 17260
},
{
"epoch": 2.39196675900277,
"grad_norm": 1.5063045024871826,
"learning_rate": 8.187091086260499e-06,
"loss": 0.1816,
"step": 17270
},
{
"epoch": 2.393351800554017,
"grad_norm": 1.6192227602005005,
"learning_rate": 8.168530462623545e-06,
"loss": 0.1703,
"step": 17280
},
{
"epoch": 2.3947368421052633,
"grad_norm": 1.5527336597442627,
"learning_rate": 8.149969838986592e-06,
"loss": 0.1936,
"step": 17290
},
{
"epoch": 2.3961218836565097,
"grad_norm": 3.69242000579834,
"learning_rate": 8.131409215349636e-06,
"loss": 0.1867,
"step": 17300
},
{
"epoch": 2.397506925207756,
"grad_norm": 2.34364914894104,
"learning_rate": 8.112848591712683e-06,
"loss": 0.1905,
"step": 17310
},
{
"epoch": 2.398891966759003,
"grad_norm": 1.7522242069244385,
"learning_rate": 8.094287968075728e-06,
"loss": 0.1719,
"step": 17320
},
{
"epoch": 2.4002770083102494,
"grad_norm": 1.8389091491699219,
"learning_rate": 8.075727344438774e-06,
"loss": 0.157,
"step": 17330
},
{
"epoch": 2.401662049861496,
"grad_norm": 1.5576094388961792,
"learning_rate": 8.05716672080182e-06,
"loss": 0.1705,
"step": 17340
},
{
"epoch": 2.403047091412742,
"grad_norm": 3.3434033393859863,
"learning_rate": 8.038606097164865e-06,
"loss": 0.2162,
"step": 17350
},
{
"epoch": 2.404432132963989,
"grad_norm": 1.8596067428588867,
"learning_rate": 8.02004547352791e-06,
"loss": 0.1546,
"step": 17360
},
{
"epoch": 2.4058171745152355,
"grad_norm": 1.6991015672683716,
"learning_rate": 8.001484849890957e-06,
"loss": 0.1709,
"step": 17370
},
{
"epoch": 2.407202216066482,
"grad_norm": 1.4487606287002563,
"learning_rate": 7.982924226254003e-06,
"loss": 0.1858,
"step": 17380
},
{
"epoch": 2.4085872576177287,
"grad_norm": 1.7766512632369995,
"learning_rate": 7.964363602617048e-06,
"loss": 0.1825,
"step": 17390
},
{
"epoch": 2.409972299168975,
"grad_norm": 2.881904363632202,
"learning_rate": 7.945802978980094e-06,
"loss": 0.2057,
"step": 17400
},
{
"epoch": 2.4113573407202216,
"grad_norm": 1.7567830085754395,
"learning_rate": 7.92724235534314e-06,
"loss": 0.1728,
"step": 17410
},
{
"epoch": 2.412742382271468,
"grad_norm": 2.2497482299804688,
"learning_rate": 7.908681731706187e-06,
"loss": 0.1695,
"step": 17420
},
{
"epoch": 2.414127423822715,
"grad_norm": 1.412612795829773,
"learning_rate": 7.890121108069232e-06,
"loss": 0.1562,
"step": 17430
},
{
"epoch": 2.4155124653739612,
"grad_norm": 2.2560765743255615,
"learning_rate": 7.871560484432278e-06,
"loss": 0.1865,
"step": 17440
},
{
"epoch": 2.4168975069252077,
"grad_norm": 4.349898338317871,
"learning_rate": 7.852999860795323e-06,
"loss": 0.1842,
"step": 17450
},
{
"epoch": 2.418282548476454,
"grad_norm": 1.4243489503860474,
"learning_rate": 7.834439237158369e-06,
"loss": 0.1711,
"step": 17460
},
{
"epoch": 2.419667590027701,
"grad_norm": 1.888654351234436,
"learning_rate": 7.815878613521414e-06,
"loss": 0.2135,
"step": 17470
},
{
"epoch": 2.4210526315789473,
"grad_norm": 2.302077293395996,
"learning_rate": 7.797317989884461e-06,
"loss": 0.1969,
"step": 17480
},
{
"epoch": 2.4224376731301938,
"grad_norm": 1.739283800125122,
"learning_rate": 7.778757366247507e-06,
"loss": 0.1409,
"step": 17490
},
{
"epoch": 2.4238227146814406,
"grad_norm": 2.2265708446502686,
"learning_rate": 7.760196742610552e-06,
"loss": 0.1555,
"step": 17500
},
{
"epoch": 2.425207756232687,
"grad_norm": 3.4545814990997314,
"learning_rate": 7.741636118973598e-06,
"loss": 0.1922,
"step": 17510
},
{
"epoch": 2.4265927977839334,
"grad_norm": 1.799225091934204,
"learning_rate": 7.723075495336643e-06,
"loss": 0.1541,
"step": 17520
},
{
"epoch": 2.4279778393351803,
"grad_norm": 3.538924217224121,
"learning_rate": 7.70451487169969e-06,
"loss": 0.1595,
"step": 17530
},
{
"epoch": 2.4293628808864267,
"grad_norm": 3.1783182621002197,
"learning_rate": 7.685954248062736e-06,
"loss": 0.1915,
"step": 17540
},
{
"epoch": 2.430747922437673,
"grad_norm": 1.569459080696106,
"learning_rate": 7.667393624425782e-06,
"loss": 0.1822,
"step": 17550
},
{
"epoch": 2.4321329639889195,
"grad_norm": 1.8626432418823242,
"learning_rate": 7.648833000788827e-06,
"loss": 0.1566,
"step": 17560
},
{
"epoch": 2.4335180055401664,
"grad_norm": 1.8595503568649292,
"learning_rate": 7.630272377151873e-06,
"loss": 0.1844,
"step": 17570
},
{
"epoch": 2.434903047091413,
"grad_norm": 1.743048906326294,
"learning_rate": 7.611711753514919e-06,
"loss": 0.1704,
"step": 17580
},
{
"epoch": 2.436288088642659,
"grad_norm": 2.213383674621582,
"learning_rate": 7.5931511298779645e-06,
"loss": 0.1702,
"step": 17590
},
{
"epoch": 2.4376731301939056,
"grad_norm": 1.8055698871612549,
"learning_rate": 7.574590506241011e-06,
"loss": 0.1555,
"step": 17600
},
{
"epoch": 2.4390581717451525,
"grad_norm": 2.1330082416534424,
"learning_rate": 7.5560298826040555e-06,
"loss": 0.2097,
"step": 17610
},
{
"epoch": 2.440443213296399,
"grad_norm": 2.3374805450439453,
"learning_rate": 7.537469258967102e-06,
"loss": 0.1959,
"step": 17620
},
{
"epoch": 2.4418282548476453,
"grad_norm": 1.8412171602249146,
"learning_rate": 7.518908635330147e-06,
"loss": 0.1711,
"step": 17630
},
{
"epoch": 2.443213296398892,
"grad_norm": 1.8535511493682861,
"learning_rate": 7.500348011693194e-06,
"loss": 0.1739,
"step": 17640
},
{
"epoch": 2.4445983379501386,
"grad_norm": 1.7968270778656006,
"learning_rate": 7.48178738805624e-06,
"loss": 0.1841,
"step": 17650
},
{
"epoch": 2.445983379501385,
"grad_norm": 2.1843976974487305,
"learning_rate": 7.463226764419285e-06,
"loss": 0.142,
"step": 17660
},
{
"epoch": 2.4473684210526314,
"grad_norm": 1.7341108322143555,
"learning_rate": 7.44466614078233e-06,
"loss": 0.1807,
"step": 17670
},
{
"epoch": 2.4487534626038783,
"grad_norm": 1.8200232982635498,
"learning_rate": 7.426105517145377e-06,
"loss": 0.1564,
"step": 17680
},
{
"epoch": 2.4501385041551247,
"grad_norm": 3.129906415939331,
"learning_rate": 7.407544893508423e-06,
"loss": 0.1846,
"step": 17690
},
{
"epoch": 2.451523545706371,
"grad_norm": 2.155862331390381,
"learning_rate": 7.3889842698714685e-06,
"loss": 0.1618,
"step": 17700
},
{
"epoch": 2.4529085872576175,
"grad_norm": 1.7917015552520752,
"learning_rate": 7.370423646234514e-06,
"loss": 0.229,
"step": 17710
},
{
"epoch": 2.4542936288088644,
"grad_norm": 1.904911756515503,
"learning_rate": 7.3518630225975595e-06,
"loss": 0.1847,
"step": 17720
},
{
"epoch": 2.455678670360111,
"grad_norm": 2.046907901763916,
"learning_rate": 7.333302398960606e-06,
"loss": 0.1729,
"step": 17730
},
{
"epoch": 2.457063711911357,
"grad_norm": 2.4984397888183594,
"learning_rate": 7.314741775323651e-06,
"loss": 0.1569,
"step": 17740
},
{
"epoch": 2.458448753462604,
"grad_norm": 1.768541693687439,
"learning_rate": 7.296181151686698e-06,
"loss": 0.1665,
"step": 17750
},
{
"epoch": 2.4598337950138505,
"grad_norm": 2.0088300704956055,
"learning_rate": 7.277620528049743e-06,
"loss": 0.1772,
"step": 17760
},
{
"epoch": 2.461218836565097,
"grad_norm": 2.9492692947387695,
"learning_rate": 7.259059904412789e-06,
"loss": 0.1923,
"step": 17770
},
{
"epoch": 2.4626038781163437,
"grad_norm": 1.7164802551269531,
"learning_rate": 7.240499280775834e-06,
"loss": 0.1693,
"step": 17780
},
{
"epoch": 2.46398891966759,
"grad_norm": 1.7039817571640015,
"learning_rate": 7.2219386571388806e-06,
"loss": 0.1357,
"step": 17790
},
{
"epoch": 2.4653739612188366,
"grad_norm": 1.6272010803222656,
"learning_rate": 7.203378033501927e-06,
"loss": 0.1675,
"step": 17800
},
{
"epoch": 2.466759002770083,
"grad_norm": 2.1668031215667725,
"learning_rate": 7.1848174098649724e-06,
"loss": 0.2196,
"step": 17810
},
{
"epoch": 2.46814404432133,
"grad_norm": 2.1896896362304688,
"learning_rate": 7.166256786228017e-06,
"loss": 0.1705,
"step": 17820
},
{
"epoch": 2.4695290858725762,
"grad_norm": 1.9577006101608276,
"learning_rate": 7.1476961625910634e-06,
"loss": 0.1603,
"step": 17830
},
{
"epoch": 2.4709141274238227,
"grad_norm": 4.705322742462158,
"learning_rate": 7.12913553895411e-06,
"loss": 0.2003,
"step": 17840
},
{
"epoch": 2.472299168975069,
"grad_norm": 1.6234703063964844,
"learning_rate": 7.110574915317155e-06,
"loss": 0.2046,
"step": 17850
},
{
"epoch": 2.473684210526316,
"grad_norm": 2.3687870502471924,
"learning_rate": 7.092014291680202e-06,
"loss": 0.18,
"step": 17860
},
{
"epoch": 2.4750692520775623,
"grad_norm": 1.580241322517395,
"learning_rate": 7.073453668043246e-06,
"loss": 0.1803,
"step": 17870
},
{
"epoch": 2.4764542936288088,
"grad_norm": 2.288207530975342,
"learning_rate": 7.054893044406293e-06,
"loss": 0.1567,
"step": 17880
},
{
"epoch": 2.4778393351800556,
"grad_norm": 2.1847081184387207,
"learning_rate": 7.036332420769338e-06,
"loss": 0.1676,
"step": 17890
},
{
"epoch": 2.479224376731302,
"grad_norm": 2.0806612968444824,
"learning_rate": 7.0177717971323845e-06,
"loss": 0.1605,
"step": 17900
},
{
"epoch": 2.4806094182825484,
"grad_norm": 1.6858736276626587,
"learning_rate": 6.999211173495431e-06,
"loss": 0.1498,
"step": 17910
},
{
"epoch": 2.481994459833795,
"grad_norm": 1.8091322183609009,
"learning_rate": 6.9806505498584755e-06,
"loss": 0.1508,
"step": 17920
},
{
"epoch": 2.4833795013850417,
"grad_norm": 2.947080135345459,
"learning_rate": 6.962089926221521e-06,
"loss": 0.1519,
"step": 17930
},
{
"epoch": 2.484764542936288,
"grad_norm": 1.562985897064209,
"learning_rate": 6.943529302584567e-06,
"loss": 0.1828,
"step": 17940
},
{
"epoch": 2.4861495844875345,
"grad_norm": 2.3566386699676514,
"learning_rate": 6.924968678947614e-06,
"loss": 0.1745,
"step": 17950
},
{
"epoch": 2.487534626038781,
"grad_norm": 2.319905996322632,
"learning_rate": 6.906408055310659e-06,
"loss": 0.1805,
"step": 17960
},
{
"epoch": 2.488919667590028,
"grad_norm": 4.170083999633789,
"learning_rate": 6.887847431673704e-06,
"loss": 0.1932,
"step": 17970
},
{
"epoch": 2.490304709141274,
"grad_norm": 3.8577587604522705,
"learning_rate": 6.86928680803675e-06,
"loss": 0.1939,
"step": 17980
},
{
"epoch": 2.4916897506925206,
"grad_norm": 1.916488528251648,
"learning_rate": 6.850726184399797e-06,
"loss": 0.1808,
"step": 17990
},
{
"epoch": 2.4930747922437675,
"grad_norm": 2.2478854656219482,
"learning_rate": 6.832165560762842e-06,
"loss": 0.1912,
"step": 18000
}
],
"logging_steps": 10,
"max_steps": 21660,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.194381944570511e+18,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}