{
"best_global_step": 5852,
"best_metric": 0.848216712474823,
"best_model_checkpoint": "./arthur-ft/checkpoint-5852",
"epoch": 1.9999006600148612,
"eval_steps": 500,
"global_step": 11704,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017086477443862974,
"grad_norm": 81.49402618408203,
"learning_rate": 1.278409090909091e-07,
"loss": 1.5968,
"step": 10
},
{
"epoch": 0.0034172954887725948,
"grad_norm": 27.926727294921875,
"learning_rate": 2.6988636363636366e-07,
"loss": 1.2647,
"step": 20
},
{
"epoch": 0.005125943233158892,
"grad_norm": 26.150636672973633,
"learning_rate": 4.119318181818182e-07,
"loss": 1.1425,
"step": 30
},
{
"epoch": 0.0068345909775451895,
"grad_norm": 32.89878463745117,
"learning_rate": 5.539772727272728e-07,
"loss": 1.1125,
"step": 40
},
{
"epoch": 0.008543238721931487,
"grad_norm": 31.276884078979492,
"learning_rate": 6.960227272727273e-07,
"loss": 1.1157,
"step": 50
},
{
"epoch": 0.010251886466317785,
"grad_norm": 22.583457946777344,
"learning_rate": 8.380681818181818e-07,
"loss": 1.1315,
"step": 60
},
{
"epoch": 0.011960534210704083,
"grad_norm": 26.30866813659668,
"learning_rate": 9.801136363636364e-07,
"loss": 1.107,
"step": 70
},
{
"epoch": 0.013669181955090379,
"grad_norm": 26.83280372619629,
"learning_rate": 1.1221590909090909e-06,
"loss": 1.1115,
"step": 80
},
{
"epoch": 0.015377829699476677,
"grad_norm": 25.484037399291992,
"learning_rate": 1.2642045454545456e-06,
"loss": 1.1214,
"step": 90
},
{
"epoch": 0.017086477443862973,
"grad_norm": 21.573806762695312,
"learning_rate": 1.40625e-06,
"loss": 1.0899,
"step": 100
},
{
"epoch": 0.018795125188249273,
"grad_norm": 31.41425323486328,
"learning_rate": 1.5482954545454546e-06,
"loss": 1.1174,
"step": 110
},
{
"epoch": 0.02050377293263557,
"grad_norm": 31.211849212646484,
"learning_rate": 1.6903409090909093e-06,
"loss": 1.1159,
"step": 120
},
{
"epoch": 0.022212420677021866,
"grad_norm": 21.36687660217285,
"learning_rate": 1.8323863636363638e-06,
"loss": 1.0619,
"step": 130
},
{
"epoch": 0.023921068421408165,
"grad_norm": 21.135522842407227,
"learning_rate": 1.9744318181818183e-06,
"loss": 1.0649,
"step": 140
},
{
"epoch": 0.025629716165794462,
"grad_norm": 15.732582092285156,
"learning_rate": 2.1164772727272728e-06,
"loss": 1.0778,
"step": 150
},
{
"epoch": 0.027338363910180758,
"grad_norm": 29.327178955078125,
"learning_rate": 2.2585227272727277e-06,
"loss": 1.0911,
"step": 160
},
{
"epoch": 0.029047011654567058,
"grad_norm": 18.275588989257812,
"learning_rate": 2.4005681818181818e-06,
"loss": 1.0608,
"step": 170
},
{
"epoch": 0.030755659398953354,
"grad_norm": 23.922975540161133,
"learning_rate": 2.5426136363636367e-06,
"loss": 1.0478,
"step": 180
},
{
"epoch": 0.03246430714333965,
"grad_norm": 20.124839782714844,
"learning_rate": 2.684659090909091e-06,
"loss": 1.0195,
"step": 190
},
{
"epoch": 0.03417295488772595,
"grad_norm": 17.850400924682617,
"learning_rate": 2.8267045454545457e-06,
"loss": 1.0701,
"step": 200
},
{
"epoch": 0.03588160263211224,
"grad_norm": 21.76569175720215,
"learning_rate": 2.96875e-06,
"loss": 1.0621,
"step": 210
},
{
"epoch": 0.037590250376498546,
"grad_norm": 18.49285125732422,
"learning_rate": 3.110795454545455e-06,
"loss": 1.0733,
"step": 220
},
{
"epoch": 0.03929889812088484,
"grad_norm": 20.746557235717773,
"learning_rate": 3.252840909090909e-06,
"loss": 1.0711,
"step": 230
},
{
"epoch": 0.04100754586527114,
"grad_norm": 22.905445098876953,
"learning_rate": 3.3948863636363636e-06,
"loss": 1.0938,
"step": 240
},
{
"epoch": 0.042716193609657435,
"grad_norm": 21.94207763671875,
"learning_rate": 3.5369318181818186e-06,
"loss": 1.0307,
"step": 250
},
{
"epoch": 0.04442484135404373,
"grad_norm": 19.50674819946289,
"learning_rate": 3.678977272727273e-06,
"loss": 1.0386,
"step": 260
},
{
"epoch": 0.04613348909843003,
"grad_norm": 33.76179504394531,
"learning_rate": 3.821022727272727e-06,
"loss": 1.024,
"step": 270
},
{
"epoch": 0.04784213684281633,
"grad_norm": 23.91062355041504,
"learning_rate": 3.963068181818182e-06,
"loss": 1.0586,
"step": 280
},
{
"epoch": 0.04955078458720263,
"grad_norm": 20.59576988220215,
"learning_rate": 4.105113636363637e-06,
"loss": 1.0404,
"step": 290
},
{
"epoch": 0.051259432331588924,
"grad_norm": 20.871490478515625,
"learning_rate": 4.247159090909092e-06,
"loss": 1.0237,
"step": 300
},
{
"epoch": 0.05296808007597522,
"grad_norm": 20.965486526489258,
"learning_rate": 4.389204545454546e-06,
"loss": 0.9917,
"step": 310
},
{
"epoch": 0.054676727820361516,
"grad_norm": 25.50008201599121,
"learning_rate": 4.53125e-06,
"loss": 1.0394,
"step": 320
},
{
"epoch": 0.05638537556474781,
"grad_norm": 19.419902801513672,
"learning_rate": 4.673295454545455e-06,
"loss": 0.9999,
"step": 330
},
{
"epoch": 0.058094023309134116,
"grad_norm": 22.62456512451172,
"learning_rate": 4.815340909090909e-06,
"loss": 1.0649,
"step": 340
},
{
"epoch": 0.05980267105352041,
"grad_norm": 29.71078109741211,
"learning_rate": 4.957386363636364e-06,
"loss": 1.0064,
"step": 350
},
{
"epoch": 0.06151131879790671,
"grad_norm": 22.22879409790039,
"learning_rate": 4.9969168428470764e-06,
"loss": 1.06,
"step": 360
},
{
"epoch": 0.063219966542293,
"grad_norm": 21.7359619140625,
"learning_rate": 4.992512332628612e-06,
"loss": 1.043,
"step": 370
},
{
"epoch": 0.0649286142866793,
"grad_norm": 18.66118812561035,
"learning_rate": 4.988107822410148e-06,
"loss": 0.9956,
"step": 380
},
{
"epoch": 0.0666372620310656,
"grad_norm": 18.667190551757812,
"learning_rate": 4.983703312191685e-06,
"loss": 1.0341,
"step": 390
},
{
"epoch": 0.0683459097754519,
"grad_norm": 21.578454971313477,
"learning_rate": 4.979298801973221e-06,
"loss": 1.0265,
"step": 400
},
{
"epoch": 0.07005455751983819,
"grad_norm": 22.787809371948242,
"learning_rate": 4.974894291754757e-06,
"loss": 1.0062,
"step": 410
},
{
"epoch": 0.07176320526422449,
"grad_norm": 15.971848487854004,
"learning_rate": 4.970489781536293e-06,
"loss": 0.9964,
"step": 420
},
{
"epoch": 0.0734718530086108,
"grad_norm": 19.03502082824707,
"learning_rate": 4.96608527131783e-06,
"loss": 1.0245,
"step": 430
},
{
"epoch": 0.07518050075299709,
"grad_norm": 19.77265167236328,
"learning_rate": 4.9616807610993666e-06,
"loss": 1.0321,
"step": 440
},
{
"epoch": 0.07688914849738339,
"grad_norm": 26.002628326416016,
"learning_rate": 4.957276250880902e-06,
"loss": 1.024,
"step": 450
},
{
"epoch": 0.07859779624176969,
"grad_norm": 20.502666473388672,
"learning_rate": 4.952871740662439e-06,
"loss": 1.02,
"step": 460
},
{
"epoch": 0.08030644398615598,
"grad_norm": 25.582834243774414,
"learning_rate": 4.948467230443975e-06,
"loss": 0.9959,
"step": 470
},
{
"epoch": 0.08201509173054228,
"grad_norm": 23.496889114379883,
"learning_rate": 4.944062720225512e-06,
"loss": 1.0149,
"step": 480
},
{
"epoch": 0.08372373947492857,
"grad_norm": 19.036056518554688,
"learning_rate": 4.9396582100070475e-06,
"loss": 1.0096,
"step": 490
},
{
"epoch": 0.08543238721931487,
"grad_norm": 17.741846084594727,
"learning_rate": 4.935253699788584e-06,
"loss": 1.0365,
"step": 500
},
{
"epoch": 0.08714103496370117,
"grad_norm": 17.736528396606445,
"learning_rate": 4.93084918957012e-06,
"loss": 0.9791,
"step": 510
},
{
"epoch": 0.08884968270808746,
"grad_norm": 15.833001136779785,
"learning_rate": 4.926444679351657e-06,
"loss": 1.0195,
"step": 520
},
{
"epoch": 0.09055833045247376,
"grad_norm": 21.06598663330078,
"learning_rate": 4.9220401691331925e-06,
"loss": 1.0199,
"step": 530
},
{
"epoch": 0.09226697819686006,
"grad_norm": 20.22286605834961,
"learning_rate": 4.917635658914729e-06,
"loss": 1.0549,
"step": 540
},
{
"epoch": 0.09397562594124637,
"grad_norm": 18.53827476501465,
"learning_rate": 4.913231148696265e-06,
"loss": 1.0115,
"step": 550
},
{
"epoch": 0.09568427368563266,
"grad_norm": 17.737276077270508,
"learning_rate": 4.908826638477802e-06,
"loss": 0.9855,
"step": 560
},
{
"epoch": 0.09739292143001896,
"grad_norm": 16.152812957763672,
"learning_rate": 4.9044221282593376e-06,
"loss": 1.0029,
"step": 570
},
{
"epoch": 0.09910156917440525,
"grad_norm": 22.577655792236328,
"learning_rate": 4.900017618040874e-06,
"loss": 0.9897,
"step": 580
},
{
"epoch": 0.10081021691879155,
"grad_norm": 19.24541664123535,
"learning_rate": 4.895613107822411e-06,
"loss": 0.9428,
"step": 590
},
{
"epoch": 0.10251886466317785,
"grad_norm": 14.999211311340332,
"learning_rate": 4.891208597603947e-06,
"loss": 1.0375,
"step": 600
},
{
"epoch": 0.10422751240756414,
"grad_norm": 17.597455978393555,
"learning_rate": 4.8868040873854835e-06,
"loss": 0.9238,
"step": 610
},
{
"epoch": 0.10593616015195044,
"grad_norm": 14.606751441955566,
"learning_rate": 4.882399577167019e-06,
"loss": 1.0094,
"step": 620
},
{
"epoch": 0.10764480789633674,
"grad_norm": 19.107078552246094,
"learning_rate": 4.877995066948556e-06,
"loss": 1.0012,
"step": 630
},
{
"epoch": 0.10935345564072303,
"grad_norm": 22.561248779296875,
"learning_rate": 4.873590556730092e-06,
"loss": 0.9683,
"step": 640
},
{
"epoch": 0.11106210338510933,
"grad_norm": 20.76687240600586,
"learning_rate": 4.869186046511628e-06,
"loss": 0.9516,
"step": 650
},
{
"epoch": 0.11277075112949562,
"grad_norm": 18.26988410949707,
"learning_rate": 4.864781536293164e-06,
"loss": 1.003,
"step": 660
},
{
"epoch": 0.11447939887388194,
"grad_norm": 24.70865821838379,
"learning_rate": 4.860377026074701e-06,
"loss": 0.9629,
"step": 670
},
{
"epoch": 0.11618804661826823,
"grad_norm": 15.833657264709473,
"learning_rate": 4.855972515856237e-06,
"loss": 0.9842,
"step": 680
},
{
"epoch": 0.11789669436265453,
"grad_norm": 23.024721145629883,
"learning_rate": 4.851568005637774e-06,
"loss": 0.9945,
"step": 690
},
{
"epoch": 0.11960534210704082,
"grad_norm": 21.521650314331055,
"learning_rate": 4.8471634954193094e-06,
"loss": 0.9154,
"step": 700
},
{
"epoch": 0.12131398985142712,
"grad_norm": 16.77184295654297,
"learning_rate": 4.842758985200846e-06,
"loss": 0.9333,
"step": 710
},
{
"epoch": 0.12302263759581342,
"grad_norm": 18.226619720458984,
"learning_rate": 4.838354474982383e-06,
"loss": 1.003,
"step": 720
},
{
"epoch": 0.12473128534019971,
"grad_norm": 18.95140266418457,
"learning_rate": 4.833949964763919e-06,
"loss": 0.9384,
"step": 730
},
{
"epoch": 0.126439933084586,
"grad_norm": 21.0819149017334,
"learning_rate": 4.829545454545455e-06,
"loss": 1.0182,
"step": 740
},
{
"epoch": 0.12814858082897232,
"grad_norm": 20.32185935974121,
"learning_rate": 4.825140944326991e-06,
"loss": 0.9399,
"step": 750
},
{
"epoch": 0.1298572285733586,
"grad_norm": 21.246639251708984,
"learning_rate": 4.820736434108528e-06,
"loss": 0.9671,
"step": 760
},
{
"epoch": 0.1315658763177449,
"grad_norm": 17.076871871948242,
"learning_rate": 4.816331923890064e-06,
"loss": 0.9338,
"step": 770
},
{
"epoch": 0.1332745240621312,
"grad_norm": 17.45732879638672,
"learning_rate": 4.8119274136715996e-06,
"loss": 0.9071,
"step": 780
},
{
"epoch": 0.1349831718065175,
"grad_norm": 15.503561973571777,
"learning_rate": 4.807522903453136e-06,
"loss": 1.0028,
"step": 790
},
{
"epoch": 0.1366918195509038,
"grad_norm": 20.221580505371094,
"learning_rate": 4.803118393234673e-06,
"loss": 0.9703,
"step": 800
},
{
"epoch": 0.1384004672952901,
"grad_norm": 16.32524299621582,
"learning_rate": 4.798713883016209e-06,
"loss": 0.9856,
"step": 810
},
{
"epoch": 0.14010911503967638,
"grad_norm": 19.584348678588867,
"learning_rate": 4.7943093727977455e-06,
"loss": 0.9418,
"step": 820
},
{
"epoch": 0.1418177627840627,
"grad_norm": 23.859182357788086,
"learning_rate": 4.789904862579281e-06,
"loss": 0.9651,
"step": 830
},
{
"epoch": 0.14352641052844897,
"grad_norm": 20.757596969604492,
"learning_rate": 4.785500352360818e-06,
"loss": 0.9272,
"step": 840
},
{
"epoch": 0.14523505827283528,
"grad_norm": 19.72559928894043,
"learning_rate": 4.781095842142354e-06,
"loss": 0.9644,
"step": 850
},
{
"epoch": 0.1469437060172216,
"grad_norm": 20.055456161499023,
"learning_rate": 4.7766913319238905e-06,
"loss": 0.9819,
"step": 860
},
{
"epoch": 0.14865235376160787,
"grad_norm": 18.991012573242188,
"learning_rate": 4.772286821705427e-06,
"loss": 0.9234,
"step": 870
},
{
"epoch": 0.15036100150599419,
"grad_norm": 17.789796829223633,
"learning_rate": 4.767882311486963e-06,
"loss": 1.0245,
"step": 880
},
{
"epoch": 0.15206964925038047,
"grad_norm": 19.364513397216797,
"learning_rate": 4.7634778012685e-06,
"loss": 0.9382,
"step": 890
},
{
"epoch": 0.15377829699476678,
"grad_norm": 18.194772720336914,
"learning_rate": 4.759073291050036e-06,
"loss": 0.9533,
"step": 900
},
{
"epoch": 0.15548694473915306,
"grad_norm": 15.367209434509277,
"learning_rate": 4.7546687808315714e-06,
"loss": 0.9901,
"step": 910
},
{
"epoch": 0.15719559248353937,
"grad_norm": 26.23330307006836,
"learning_rate": 4.750264270613108e-06,
"loss": 0.8894,
"step": 920
},
{
"epoch": 0.15890424022792565,
"grad_norm": 20.43960189819336,
"learning_rate": 4.745859760394644e-06,
"loss": 0.9469,
"step": 930
},
{
"epoch": 0.16061288797231196,
"grad_norm": 17.476476669311523,
"learning_rate": 4.741455250176181e-06,
"loss": 0.9681,
"step": 940
},
{
"epoch": 0.16232153571669825,
"grad_norm": 18.390302658081055,
"learning_rate": 4.737050739957717e-06,
"loss": 0.9627,
"step": 950
},
{
"epoch": 0.16403018346108456,
"grad_norm": 26.21846580505371,
"learning_rate": 4.732646229739253e-06,
"loss": 0.9453,
"step": 960
},
{
"epoch": 0.16573883120547084,
"grad_norm": 17.23887062072754,
"learning_rate": 4.72824171952079e-06,
"loss": 0.9315,
"step": 970
},
{
"epoch": 0.16744747894985715,
"grad_norm": 15.847450256347656,
"learning_rate": 4.723837209302326e-06,
"loss": 0.9448,
"step": 980
},
{
"epoch": 0.16915612669424346,
"grad_norm": 20.83458709716797,
"learning_rate": 4.719432699083862e-06,
"loss": 0.9788,
"step": 990
},
{
"epoch": 0.17086477443862974,
"grad_norm": 28.041086196899414,
"learning_rate": 4.715028188865399e-06,
"loss": 0.9217,
"step": 1000
},
{
"epoch": 0.17257342218301605,
"grad_norm": 21.50284767150879,
"learning_rate": 4.710623678646935e-06,
"loss": 0.8928,
"step": 1010
},
{
"epoch": 0.17428206992740233,
"grad_norm": 21.39044761657715,
"learning_rate": 4.706219168428472e-06,
"loss": 0.9415,
"step": 1020
},
{
"epoch": 0.17599071767178864,
"grad_norm": 21.087949752807617,
"learning_rate": 4.7018146582100075e-06,
"loss": 0.9399,
"step": 1030
},
{
"epoch": 0.17769936541617493,
"grad_norm": 16.453859329223633,
"learning_rate": 4.697410147991543e-06,
"loss": 0.948,
"step": 1040
},
{
"epoch": 0.17940801316056124,
"grad_norm": 18.201675415039062,
"learning_rate": 4.69300563777308e-06,
"loss": 0.9403,
"step": 1050
},
{
"epoch": 0.18111666090494752,
"grad_norm": 18.971012115478516,
"learning_rate": 4.688601127554616e-06,
"loss": 0.9294,
"step": 1060
},
{
"epoch": 0.18282530864933383,
"grad_norm": 18.481828689575195,
"learning_rate": 4.6841966173361525e-06,
"loss": 0.8809,
"step": 1070
},
{
"epoch": 0.1845339563937201,
"grad_norm": 17.92839813232422,
"learning_rate": 4.679792107117689e-06,
"loss": 0.9608,
"step": 1080
},
{
"epoch": 0.18624260413810642,
"grad_norm": 21.6907958984375,
"learning_rate": 4.675387596899225e-06,
"loss": 0.9896,
"step": 1090
},
{
"epoch": 0.18795125188249273,
"grad_norm": 19.17830467224121,
"learning_rate": 4.670983086680762e-06,
"loss": 0.9319,
"step": 1100
},
{
"epoch": 0.189659899626879,
"grad_norm": 19.919885635375977,
"learning_rate": 4.666578576462298e-06,
"loss": 0.9509,
"step": 1110
},
{
"epoch": 0.19136854737126532,
"grad_norm": 13.461675643920898,
"learning_rate": 4.662174066243834e-06,
"loss": 0.9315,
"step": 1120
},
{
"epoch": 0.1930771951156516,
"grad_norm": 25.66329574584961,
"learning_rate": 4.65776955602537e-06,
"loss": 0.9203,
"step": 1130
},
{
"epoch": 0.19478584286003792,
"grad_norm": 20.48524284362793,
"learning_rate": 4.653365045806907e-06,
"loss": 0.8759,
"step": 1140
},
{
"epoch": 0.1964944906044242,
"grad_norm": 18.57932472229004,
"learning_rate": 4.6489605355884435e-06,
"loss": 0.9367,
"step": 1150
},
{
"epoch": 0.1982031383488105,
"grad_norm": 24.531593322753906,
"learning_rate": 4.644556025369979e-06,
"loss": 0.924,
"step": 1160
},
{
"epoch": 0.1999117860931968,
"grad_norm": 19.594648361206055,
"learning_rate": 4.640151515151515e-06,
"loss": 0.9189,
"step": 1170
},
{
"epoch": 0.2016204338375831,
"grad_norm": 18.946157455444336,
"learning_rate": 4.635747004933052e-06,
"loss": 0.9476,
"step": 1180
},
{
"epoch": 0.20332908158196938,
"grad_norm": 18.381322860717773,
"learning_rate": 4.631342494714588e-06,
"loss": 0.961,
"step": 1190
},
{
"epoch": 0.2050377293263557,
"grad_norm": 18.244287490844727,
"learning_rate": 4.626937984496124e-06,
"loss": 0.9345,
"step": 1200
},
{
"epoch": 0.20674637707074198,
"grad_norm": 21.273303985595703,
"learning_rate": 4.62253347427766e-06,
"loss": 0.8893,
"step": 1210
},
{
"epoch": 0.2084550248151283,
"grad_norm": 21.534873962402344,
"learning_rate": 4.618128964059197e-06,
"loss": 0.8948,
"step": 1220
},
{
"epoch": 0.2101636725595146,
"grad_norm": 20.035734176635742,
"learning_rate": 4.613724453840734e-06,
"loss": 0.9211,
"step": 1230
},
{
"epoch": 0.21187232030390088,
"grad_norm": 19.587982177734375,
"learning_rate": 4.6093199436222695e-06,
"loss": 0.9074,
"step": 1240
},
{
"epoch": 0.2135809680482872,
"grad_norm": 20.059412002563477,
"learning_rate": 4.604915433403806e-06,
"loss": 0.9389,
"step": 1250
},
{
"epoch": 0.21528961579267347,
"grad_norm": 23.202457427978516,
"learning_rate": 4.600510923185342e-06,
"loss": 0.9303,
"step": 1260
},
{
"epoch": 0.21699826353705978,
"grad_norm": 22.944717407226562,
"learning_rate": 4.596106412966879e-06,
"loss": 0.9118,
"step": 1270
},
{
"epoch": 0.21870691128144606,
"grad_norm": 19.934560775756836,
"learning_rate": 4.591701902748415e-06,
"loss": 0.9638,
"step": 1280
},
{
"epoch": 0.22041555902583237,
"grad_norm": 19.087709426879883,
"learning_rate": 4.587297392529951e-06,
"loss": 0.9519,
"step": 1290
},
{
"epoch": 0.22212420677021866,
"grad_norm": 17.25513458251953,
"learning_rate": 4.582892882311487e-06,
"loss": 0.8735,
"step": 1300
},
{
"epoch": 0.22383285451460497,
"grad_norm": 23.020050048828125,
"learning_rate": 4.578488372093024e-06,
"loss": 0.9319,
"step": 1310
},
{
"epoch": 0.22554150225899125,
"grad_norm": 18.893648147583008,
"learning_rate": 4.57408386187456e-06,
"loss": 0.9329,
"step": 1320
},
{
"epoch": 0.22725015000337756,
"grad_norm": 20.73868179321289,
"learning_rate": 4.569679351656096e-06,
"loss": 0.8715,
"step": 1330
},
{
"epoch": 0.22895879774776387,
"grad_norm": 25.549577713012695,
"learning_rate": 4.565274841437632e-06,
"loss": 0.9145,
"step": 1340
},
{
"epoch": 0.23066744549215015,
"grad_norm": 18.99001693725586,
"learning_rate": 4.560870331219169e-06,
"loss": 0.9251,
"step": 1350
},
{
"epoch": 0.23237609323653646,
"grad_norm": 19.704002380371094,
"learning_rate": 4.5564658210007055e-06,
"loss": 0.9342,
"step": 1360
},
{
"epoch": 0.23408474098092275,
"grad_norm": 20.581199645996094,
"learning_rate": 4.552061310782241e-06,
"loss": 0.9107,
"step": 1370
},
{
"epoch": 0.23579338872530906,
"grad_norm": 18.79061508178711,
"learning_rate": 4.547656800563778e-06,
"loss": 0.962,
"step": 1380
},
{
"epoch": 0.23750203646969534,
"grad_norm": 17.29990577697754,
"learning_rate": 4.543252290345314e-06,
"loss": 0.8641,
"step": 1390
},
{
"epoch": 0.23921068421408165,
"grad_norm": 16.5628719329834,
"learning_rate": 4.5388477801268506e-06,
"loss": 0.9237,
"step": 1400
},
{
"epoch": 0.24091933195846793,
"grad_norm": 17.68106460571289,
"learning_rate": 4.534443269908386e-06,
"loss": 0.9167,
"step": 1410
},
{
"epoch": 0.24262797970285424,
"grad_norm": 20.802289962768555,
"learning_rate": 4.530038759689923e-06,
"loss": 0.9236,
"step": 1420
},
{
"epoch": 0.24433662744724052,
"grad_norm": 15.484850883483887,
"learning_rate": 4.525634249471459e-06,
"loss": 0.9322,
"step": 1430
},
{
"epoch": 0.24604527519162683,
"grad_norm": 21.147815704345703,
"learning_rate": 4.521229739252996e-06,
"loss": 0.9034,
"step": 1440
},
{
"epoch": 0.24775392293601312,
"grad_norm": 20.891565322875977,
"learning_rate": 4.5168252290345315e-06,
"loss": 0.9118,
"step": 1450
},
{
"epoch": 0.24946257068039943,
"grad_norm": 20.994525909423828,
"learning_rate": 4.512420718816068e-06,
"loss": 0.8692,
"step": 1460
},
{
"epoch": 0.25117121842478574,
"grad_norm": 15.313887596130371,
"learning_rate": 4.508016208597604e-06,
"loss": 0.9845,
"step": 1470
},
{
"epoch": 0.252879866169172,
"grad_norm": 20.045129776000977,
"learning_rate": 4.503611698379141e-06,
"loss": 0.9341,
"step": 1480
},
{
"epoch": 0.2545885139135583,
"grad_norm": 20.646169662475586,
"learning_rate": 4.4992071881606765e-06,
"loss": 0.9434,
"step": 1490
},
{
"epoch": 0.25629716165794464,
"grad_norm": 21.185823440551758,
"learning_rate": 4.494802677942213e-06,
"loss": 0.9123,
"step": 1500
},
{
"epoch": 0.2580058094023309,
"grad_norm": 15.910945892333984,
"learning_rate": 4.49039816772375e-06,
"loss": 0.9393,
"step": 1510
},
{
"epoch": 0.2597144571467172,
"grad_norm": 19.833402633666992,
"learning_rate": 4.485993657505286e-06,
"loss": 0.9744,
"step": 1520
},
{
"epoch": 0.2614231048911035,
"grad_norm": 18.990707397460938,
"learning_rate": 4.481589147286822e-06,
"loss": 0.9286,
"step": 1530
},
{
"epoch": 0.2631317526354898,
"grad_norm": 21.067312240600586,
"learning_rate": 4.477184637068358e-06,
"loss": 0.9652,
"step": 1540
},
{
"epoch": 0.2648404003798761,
"grad_norm": 20.689836502075195,
"learning_rate": 4.472780126849895e-06,
"loss": 0.9158,
"step": 1550
},
{
"epoch": 0.2665490481242624,
"grad_norm": 17.695697784423828,
"learning_rate": 4.468375616631431e-06,
"loss": 0.8859,
"step": 1560
},
{
"epoch": 0.26825769586864867,
"grad_norm": 20.22654914855957,
"learning_rate": 4.463971106412967e-06,
"loss": 0.8948,
"step": 1570
},
{
"epoch": 0.269966343613035,
"grad_norm": 15.549092292785645,
"learning_rate": 4.459566596194503e-06,
"loss": 0.8615,
"step": 1580
},
{
"epoch": 0.2716749913574213,
"grad_norm": 18.86482810974121,
"learning_rate": 4.45516208597604e-06,
"loss": 0.9054,
"step": 1590
},
{
"epoch": 0.2733836391018076,
"grad_norm": 18.071102142333984,
"learning_rate": 4.450757575757576e-06,
"loss": 0.939,
"step": 1600
},
{
"epoch": 0.2750922868461939,
"grad_norm": 22.560697555541992,
"learning_rate": 4.4463530655391125e-06,
"loss": 0.8798,
"step": 1610
},
{
"epoch": 0.2768009345905802,
"grad_norm": 19.61587905883789,
"learning_rate": 4.441948555320648e-06,
"loss": 0.8923,
"step": 1620
},
{
"epoch": 0.2785095823349665,
"grad_norm": 13.9995698928833,
"learning_rate": 4.437544045102185e-06,
"loss": 0.9556,
"step": 1630
},
{
"epoch": 0.28021823007935276,
"grad_norm": 23.62803077697754,
"learning_rate": 4.433139534883722e-06,
"loss": 0.8756,
"step": 1640
},
{
"epoch": 0.2819268778237391,
"grad_norm": 19.477319717407227,
"learning_rate": 4.428735024665258e-06,
"loss": 0.8588,
"step": 1650
},
{
"epoch": 0.2836355255681254,
"grad_norm": 17.02006721496582,
"learning_rate": 4.424330514446794e-06,
"loss": 0.934,
"step": 1660
},
{
"epoch": 0.28534417331251166,
"grad_norm": 18.509023666381836,
"learning_rate": 4.41992600422833e-06,
"loss": 0.9693,
"step": 1670
},
{
"epoch": 0.28705282105689794,
"grad_norm": 16.825519561767578,
"learning_rate": 4.415521494009867e-06,
"loss": 0.8973,
"step": 1680
},
{
"epoch": 0.2887614688012843,
"grad_norm": 18.926586151123047,
"learning_rate": 4.411116983791403e-06,
"loss": 0.8644,
"step": 1690
},
{
"epoch": 0.29047011654567056,
"grad_norm": 20.28687286376953,
"learning_rate": 4.4067124735729385e-06,
"loss": 0.9245,
"step": 1700
},
{
"epoch": 0.29217876429005685,
"grad_norm": 23.774314880371094,
"learning_rate": 4.402307963354475e-06,
"loss": 0.8688,
"step": 1710
},
{
"epoch": 0.2938874120344432,
"grad_norm": 18.38115692138672,
"learning_rate": 4.397903453136012e-06,
"loss": 0.8836,
"step": 1720
},
{
"epoch": 0.29559605977882947,
"grad_norm": 17.962003707885742,
"learning_rate": 4.393498942917548e-06,
"loss": 0.8547,
"step": 1730
},
{
"epoch": 0.29730470752321575,
"grad_norm": 17.536418914794922,
"learning_rate": 4.389094432699084e-06,
"loss": 0.8899,
"step": 1740
},
{
"epoch": 0.29901335526760203,
"grad_norm": 24.884021759033203,
"learning_rate": 4.38468992248062e-06,
"loss": 0.8861,
"step": 1750
},
{
"epoch": 0.30072200301198837,
"grad_norm": 21.32032012939453,
"learning_rate": 4.380285412262157e-06,
"loss": 0.8905,
"step": 1760
},
{
"epoch": 0.30243065075637465,
"grad_norm": 17.606523513793945,
"learning_rate": 4.375880902043693e-06,
"loss": 0.8898,
"step": 1770
},
{
"epoch": 0.30413929850076094,
"grad_norm": 18.825279235839844,
"learning_rate": 4.3714763918252295e-06,
"loss": 0.8806,
"step": 1780
},
{
"epoch": 0.3058479462451472,
"grad_norm": 18.960371017456055,
"learning_rate": 4.367071881606766e-06,
"loss": 0.897,
"step": 1790
},
{
"epoch": 0.30755659398953356,
"grad_norm": 22.261259078979492,
"learning_rate": 4.362667371388302e-06,
"loss": 0.8931,
"step": 1800
},
{
"epoch": 0.30926524173391984,
"grad_norm": 23.404190063476562,
"learning_rate": 4.358262861169839e-06,
"loss": 0.8802,
"step": 1810
},
{
"epoch": 0.3109738894783061,
"grad_norm": 14.648833274841309,
"learning_rate": 4.3538583509513745e-06,
"loss": 0.9234,
"step": 1820
},
{
"epoch": 0.31268253722269246,
"grad_norm": 18.37412452697754,
"learning_rate": 4.34945384073291e-06,
"loss": 0.8852,
"step": 1830
},
{
"epoch": 0.31439118496707874,
"grad_norm": 24.400611877441406,
"learning_rate": 4.345049330514447e-06,
"loss": 0.8791,
"step": 1840
},
{
"epoch": 0.316099832711465,
"grad_norm": 17.905906677246094,
"learning_rate": 4.340644820295983e-06,
"loss": 0.874,
"step": 1850
},
{
"epoch": 0.3178084804558513,
"grad_norm": 16.834829330444336,
"learning_rate": 4.33624031007752e-06,
"loss": 0.907,
"step": 1860
},
{
"epoch": 0.31951712820023764,
"grad_norm": 18.529735565185547,
"learning_rate": 4.331835799859056e-06,
"loss": 0.8914,
"step": 1870
},
{
"epoch": 0.3212257759446239,
"grad_norm": 18.155649185180664,
"learning_rate": 4.327431289640592e-06,
"loss": 0.8678,
"step": 1880
},
{
"epoch": 0.3229344236890102,
"grad_norm": 15.488029479980469,
"learning_rate": 4.323026779422129e-06,
"loss": 0.8673,
"step": 1890
},
{
"epoch": 0.3246430714333965,
"grad_norm": 22.161739349365234,
"learning_rate": 4.318622269203665e-06,
"loss": 0.8082,
"step": 1900
},
{
"epoch": 0.32635171917778283,
"grad_norm": 21.01485252380371,
"learning_rate": 4.314217758985201e-06,
"loss": 0.8848,
"step": 1910
},
{
"epoch": 0.3280603669221691,
"grad_norm": 17.303821563720703,
"learning_rate": 4.309813248766738e-06,
"loss": 0.8917,
"step": 1920
},
{
"epoch": 0.3297690146665554,
"grad_norm": 27.273990631103516,
"learning_rate": 4.305408738548274e-06,
"loss": 0.8299,
"step": 1930
},
{
"epoch": 0.3314776624109417,
"grad_norm": 20.609886169433594,
"learning_rate": 4.3010042283298106e-06,
"loss": 0.8975,
"step": 1940
},
{
"epoch": 0.333186310155328,
"grad_norm": 21.860870361328125,
"learning_rate": 4.296599718111346e-06,
"loss": 0.8811,
"step": 1950
},
{
"epoch": 0.3348949578997143,
"grad_norm": 21.051359176635742,
"learning_rate": 4.292195207892882e-06,
"loss": 0.9229,
"step": 1960
},
{
"epoch": 0.3366036056441006,
"grad_norm": 22.4477596282959,
"learning_rate": 4.287790697674419e-06,
"loss": 0.865,
"step": 1970
},
{
"epoch": 0.3383122533884869,
"grad_norm": 20.99222755432129,
"learning_rate": 4.283386187455955e-06,
"loss": 0.8377,
"step": 1980
},
{
"epoch": 0.3400209011328732,
"grad_norm": 23.59244155883789,
"learning_rate": 4.2789816772374915e-06,
"loss": 0.8419,
"step": 1990
},
{
"epoch": 0.3417295488772595,
"grad_norm": 17.199111938476562,
"learning_rate": 4.274577167019028e-06,
"loss": 0.9104,
"step": 2000
},
{
"epoch": 0.34343819662164576,
"grad_norm": 23.190162658691406,
"learning_rate": 4.270172656800564e-06,
"loss": 0.859,
"step": 2010
},
{
"epoch": 0.3451468443660321,
"grad_norm": 22.3214168548584,
"learning_rate": 4.265768146582101e-06,
"loss": 0.898,
"step": 2020
},
{
"epoch": 0.3468554921104184,
"grad_norm": 17.06951141357422,
"learning_rate": 4.2613636363636365e-06,
"loss": 0.9043,
"step": 2030
},
{
"epoch": 0.34856413985480467,
"grad_norm": 22.465560913085938,
"learning_rate": 4.256959126145173e-06,
"loss": 0.8559,
"step": 2040
},
{
"epoch": 0.35027278759919095,
"grad_norm": 20.7056884765625,
"learning_rate": 4.25255461592671e-06,
"loss": 0.8545,
"step": 2050
},
{
"epoch": 0.3519814353435773,
"grad_norm": 18.856229782104492,
"learning_rate": 4.248150105708246e-06,
"loss": 0.8404,
"step": 2060
},
{
"epoch": 0.35369008308796357,
"grad_norm": 19.156654357910156,
"learning_rate": 4.2437455954897824e-06,
"loss": 0.9017,
"step": 2070
},
{
"epoch": 0.35539873083234985,
"grad_norm": 19.859079360961914,
"learning_rate": 4.239341085271318e-06,
"loss": 0.9067,
"step": 2080
},
{
"epoch": 0.3571073785767362,
"grad_norm": 20.216876983642578,
"learning_rate": 4.234936575052854e-06,
"loss": 0.8961,
"step": 2090
},
{
"epoch": 0.35881602632112247,
"grad_norm": 21.373823165893555,
"learning_rate": 4.230532064834391e-06,
"loss": 0.8803,
"step": 2100
},
{
"epoch": 0.36052467406550875,
"grad_norm": 21.679407119750977,
"learning_rate": 4.226127554615927e-06,
"loss": 0.857,
"step": 2110
},
{
"epoch": 0.36223332180989504,
"grad_norm": 21.290212631225586,
"learning_rate": 4.221723044397463e-06,
"loss": 0.9099,
"step": 2120
},
{
"epoch": 0.3639419695542814,
"grad_norm": 23.025487899780273,
"learning_rate": 4.217318534179e-06,
"loss": 0.8501,
"step": 2130
},
{
"epoch": 0.36565061729866766,
"grad_norm": 24.255035400390625,
"learning_rate": 4.212914023960536e-06,
"loss": 0.8415,
"step": 2140
},
{
"epoch": 0.36735926504305394,
"grad_norm": 19.89132308959961,
"learning_rate": 4.2085095137420726e-06,
"loss": 0.8498,
"step": 2150
},
{
"epoch": 0.3690679127874402,
"grad_norm": 19.75184440612793,
"learning_rate": 4.204105003523608e-06,
"loss": 0.8162,
"step": 2160
},
{
"epoch": 0.37077656053182656,
"grad_norm": 19.339553833007812,
"learning_rate": 4.199700493305145e-06,
"loss": 0.8784,
"step": 2170
},
{
"epoch": 0.37248520827621284,
"grad_norm": 15.743782997131348,
"learning_rate": 4.195295983086681e-06,
"loss": 0.8739,
"step": 2180
},
{
"epoch": 0.3741938560205991,
"grad_norm": 20.931917190551758,
"learning_rate": 4.190891472868218e-06,
"loss": 0.8697,
"step": 2190
},
{
"epoch": 0.37590250376498546,
"grad_norm": 21.439781188964844,
"learning_rate": 4.186486962649754e-06,
"loss": 0.9417,
"step": 2200
},
{
"epoch": 0.37761115150937175,
"grad_norm": 19.33049964904785,
"learning_rate": 4.18208245243129e-06,
"loss": 0.8648,
"step": 2210
},
{
"epoch": 0.379319799253758,
"grad_norm": 20.86115074157715,
"learning_rate": 4.177677942212826e-06,
"loss": 0.9008,
"step": 2220
},
{
"epoch": 0.3810284469981443,
"grad_norm": 21.383541107177734,
"learning_rate": 4.173273431994363e-06,
"loss": 0.8436,
"step": 2230
},
{
"epoch": 0.38273709474253065,
"grad_norm": 20.323444366455078,
"learning_rate": 4.1688689217758985e-06,
"loss": 0.8607,
"step": 2240
},
{
"epoch": 0.38444574248691693,
"grad_norm": 20.108402252197266,
"learning_rate": 4.164464411557435e-06,
"loss": 0.8718,
"step": 2250
},
{
"epoch": 0.3861543902313032,
"grad_norm": 27.39733123779297,
"learning_rate": 4.160059901338971e-06,
"loss": 0.801,
"step": 2260
},
{
"epoch": 0.3878630379756895,
"grad_norm": 19.76158332824707,
"learning_rate": 4.155655391120508e-06,
"loss": 0.8525,
"step": 2270
},
{
"epoch": 0.38957168572007583,
"grad_norm": 20.22632598876953,
"learning_rate": 4.1512508809020444e-06,
"loss": 0.8277,
"step": 2280
},
{
"epoch": 0.3912803334644621,
"grad_norm": 20.0892333984375,
"learning_rate": 4.14684637068358e-06,
"loss": 0.8352,
"step": 2290
},
{
"epoch": 0.3929889812088484,
"grad_norm": 18.96234893798828,
"learning_rate": 4.142441860465117e-06,
"loss": 0.8461,
"step": 2300
},
{
"epoch": 0.39469762895323474,
"grad_norm": 24.289127349853516,
"learning_rate": 4.138037350246653e-06,
"loss": 0.8953,
"step": 2310
},
{
"epoch": 0.396406276697621,
"grad_norm": 22.399789810180664,
"learning_rate": 4.1336328400281895e-06,
"loss": 0.8364,
"step": 2320
},
{
"epoch": 0.3981149244420073,
"grad_norm": 24.583871841430664,
"learning_rate": 4.129228329809726e-06,
"loss": 0.8271,
"step": 2330
},
{
"epoch": 0.3998235721863936,
"grad_norm": 25.536149978637695,
"learning_rate": 4.124823819591261e-06,
"loss": 0.8332,
"step": 2340
},
{
"epoch": 0.4015322199307799,
"grad_norm": 25.381229400634766,
"learning_rate": 4.120419309372798e-06,
"loss": 0.8155,
"step": 2350
},
{
"epoch": 0.4032408676751662,
"grad_norm": 20.306066513061523,
"learning_rate": 4.1160147991543346e-06,
"loss": 0.8213,
"step": 2360
},
{
"epoch": 0.4049495154195525,
"grad_norm": 22.400867462158203,
"learning_rate": 4.11161028893587e-06,
"loss": 0.8163,
"step": 2370
},
{
"epoch": 0.40665816316393877,
"grad_norm": 16.857330322265625,
"learning_rate": 4.107205778717407e-06,
"loss": 0.8498,
"step": 2380
},
{
"epoch": 0.4083668109083251,
"grad_norm": 23.580421447753906,
"learning_rate": 4.102801268498943e-06,
"loss": 0.8312,
"step": 2390
},
{
"epoch": 0.4100754586527114,
"grad_norm": 19.323286056518555,
"learning_rate": 4.09839675828048e-06,
"loss": 0.8104,
"step": 2400
},
{
"epoch": 0.41178410639709767,
"grad_norm": 20.80855941772461,
"learning_rate": 4.093992248062016e-06,
"loss": 0.8135,
"step": 2410
},
{
"epoch": 0.41349275414148395,
"grad_norm": 19.280595779418945,
"learning_rate": 4.089587737843552e-06,
"loss": 0.844,
"step": 2420
},
{
"epoch": 0.4152014018858703,
"grad_norm": 24.815204620361328,
"learning_rate": 4.085183227625089e-06,
"loss": 0.8324,
"step": 2430
},
{
"epoch": 0.4169100496302566,
"grad_norm": 19.941333770751953,
"learning_rate": 4.080778717406625e-06,
"loss": 0.8529,
"step": 2440
},
{
"epoch": 0.41861869737464286,
"grad_norm": 18.017372131347656,
"learning_rate": 4.076374207188161e-06,
"loss": 0.8462,
"step": 2450
},
{
"epoch": 0.4203273451190292,
"grad_norm": 15.000432014465332,
"learning_rate": 4.071969696969697e-06,
"loss": 0.8409,
"step": 2460
},
{
"epoch": 0.4220359928634155,
"grad_norm": 26.720317840576172,
"learning_rate": 4.067565186751233e-06,
"loss": 0.8698,
"step": 2470
},
{
"epoch": 0.42374464060780176,
"grad_norm": 21.395301818847656,
"learning_rate": 4.06316067653277e-06,
"loss": 0.7904,
"step": 2480
},
{
"epoch": 0.42545328835218804,
"grad_norm": 22.071170806884766,
"learning_rate": 4.058756166314306e-06,
"loss": 0.8307,
"step": 2490
},
{
"epoch": 0.4271619360965744,
"grad_norm": 18.912866592407227,
"learning_rate": 4.054351656095842e-06,
"loss": 0.8372,
"step": 2500
},
{
"epoch": 0.42887058384096066,
"grad_norm": 23.51670265197754,
"learning_rate": 4.049947145877379e-06,
"loss": 0.8141,
"step": 2510
},
{
"epoch": 0.43057923158534694,
"grad_norm": 17.042999267578125,
"learning_rate": 4.045542635658915e-06,
"loss": 0.8862,
"step": 2520
},
{
"epoch": 0.4322878793297332,
"grad_norm": 21.787776947021484,
"learning_rate": 4.0411381254404515e-06,
"loss": 0.8552,
"step": 2530
},
{
"epoch": 0.43399652707411956,
"grad_norm": 20.526792526245117,
"learning_rate": 4.036733615221987e-06,
"loss": 0.8179,
"step": 2540
},
{
"epoch": 0.43570517481850585,
"grad_norm": 25.407398223876953,
"learning_rate": 4.032329105003524e-06,
"loss": 0.8514,
"step": 2550
},
{
"epoch": 0.43741382256289213,
"grad_norm": 16.01190948486328,
"learning_rate": 4.027924594785061e-06,
"loss": 0.8364,
"step": 2560
},
{
"epoch": 0.43912247030727847,
"grad_norm": 20.050710678100586,
"learning_rate": 4.0235200845665965e-06,
"loss": 0.8362,
"step": 2570
},
{
"epoch": 0.44083111805166475,
"grad_norm": 20.279884338378906,
"learning_rate": 4.019115574348133e-06,
"loss": 0.8034,
"step": 2580
},
{
"epoch": 0.44253976579605103,
"grad_norm": 18.78345489501953,
"learning_rate": 4.014711064129669e-06,
"loss": 0.8336,
"step": 2590
},
{
"epoch": 0.4442484135404373,
"grad_norm": 24.339946746826172,
"learning_rate": 4.010306553911205e-06,
"loss": 0.8588,
"step": 2600
},
{
"epoch": 0.44595706128482365,
"grad_norm": 19.264131546020508,
"learning_rate": 4.005902043692742e-06,
"loss": 0.8536,
"step": 2610
},
{
"epoch": 0.44766570902920994,
"grad_norm": 18.921791076660156,
"learning_rate": 4.0014975334742774e-06,
"loss": 0.819,
"step": 2620
},
{
"epoch": 0.4493743567735962,
"grad_norm": 18.068126678466797,
"learning_rate": 3.997093023255814e-06,
"loss": 0.8061,
"step": 2630
},
{
"epoch": 0.4510830045179825,
"grad_norm": 17.197391510009766,
"learning_rate": 3.992688513037351e-06,
"loss": 0.7977,
"step": 2640
},
{
"epoch": 0.45279165226236884,
"grad_norm": 17.76527976989746,
"learning_rate": 3.988284002818887e-06,
"loss": 0.9012,
"step": 2650
},
{
"epoch": 0.4545003000067551,
"grad_norm": 19.648696899414062,
"learning_rate": 3.983879492600423e-06,
"loss": 0.8426,
"step": 2660
},
{
"epoch": 0.4562089477511414,
"grad_norm": 23.721616744995117,
"learning_rate": 3.979474982381959e-06,
"loss": 0.8055,
"step": 2670
},
{
"epoch": 0.45791759549552774,
"grad_norm": 19.17746353149414,
"learning_rate": 3.975070472163496e-06,
"loss": 0.8192,
"step": 2680
},
{
"epoch": 0.459626243239914,
"grad_norm": 19.428604125976562,
"learning_rate": 3.9706659619450326e-06,
"loss": 0.8168,
"step": 2690
},
{
"epoch": 0.4613348909843003,
"grad_norm": 20.59436798095703,
"learning_rate": 3.966261451726568e-06,
"loss": 0.873,
"step": 2700
},
{
"epoch": 0.4630435387286866,
"grad_norm": 22.71458625793457,
"learning_rate": 3.961856941508105e-06,
"loss": 0.8439,
"step": 2710
},
{
"epoch": 0.4647521864730729,
"grad_norm": 30.239309310913086,
"learning_rate": 3.957452431289641e-06,
"loss": 0.7587,
"step": 2720
},
{
"epoch": 0.4664608342174592,
"grad_norm": 18.89266014099121,
"learning_rate": 3.953047921071177e-06,
"loss": 0.8704,
"step": 2730
},
{
"epoch": 0.4681694819618455,
"grad_norm": 18.356983184814453,
"learning_rate": 3.9486434108527135e-06,
"loss": 0.8343,
"step": 2740
},
{
"epoch": 0.4698781297062318,
"grad_norm": 20.14874267578125,
"learning_rate": 3.944238900634249e-06,
"loss": 0.8119,
"step": 2750
},
{
"epoch": 0.4715867774506181,
"grad_norm": 28.85474967956543,
"learning_rate": 3.939834390415786e-06,
"loss": 0.8767,
"step": 2760
},
{
"epoch": 0.4732954251950044,
"grad_norm": 22.18447494506836,
"learning_rate": 3.935429880197323e-06,
"loss": 0.8175,
"step": 2770
},
{
"epoch": 0.4750040729393907,
"grad_norm": 22.46308135986328,
"learning_rate": 3.9310253699788585e-06,
"loss": 0.8281,
"step": 2780
},
{
"epoch": 0.47671272068377696,
"grad_norm": 18.005477905273438,
"learning_rate": 3.926620859760395e-06,
"loss": 0.7928,
"step": 2790
},
{
"epoch": 0.4784213684281633,
"grad_norm": 18.78510093688965,
"learning_rate": 3.922216349541931e-06,
"loss": 0.8042,
"step": 2800
},
{
"epoch": 0.4801300161725496,
"grad_norm": 22.980220794677734,
"learning_rate": 3.917811839323468e-06,
"loss": 0.8096,
"step": 2810
},
{
"epoch": 0.48183866391693586,
"grad_norm": 18.548603057861328,
"learning_rate": 3.913407329105004e-06,
"loss": 0.8696,
"step": 2820
},
{
"epoch": 0.4835473116613222,
"grad_norm": 23.86473846435547,
"learning_rate": 3.90900281888654e-06,
"loss": 0.8154,
"step": 2830
},
{
"epoch": 0.4852559594057085,
"grad_norm": 24.574298858642578,
"learning_rate": 3.904598308668077e-06,
"loss": 0.8047,
"step": 2840
},
{
"epoch": 0.48696460715009476,
"grad_norm": 21.888259887695312,
"learning_rate": 3.900193798449613e-06,
"loss": 0.8212,
"step": 2850
},
{
"epoch": 0.48867325489448105,
"grad_norm": 18.29496955871582,
"learning_rate": 3.895789288231149e-06,
"loss": 0.8578,
"step": 2860
},
{
"epoch": 0.4903819026388674,
"grad_norm": 19.80244255065918,
"learning_rate": 3.891384778012685e-06,
"loss": 0.8449,
"step": 2870
},
{
"epoch": 0.49209055038325367,
"grad_norm": 22.403602600097656,
"learning_rate": 3.886980267794221e-06,
"loss": 0.8207,
"step": 2880
},
{
"epoch": 0.49379919812763995,
"grad_norm": 25.105716705322266,
"learning_rate": 3.882575757575758e-06,
"loss": 0.8606,
"step": 2890
},
{
"epoch": 0.49550784587202623,
"grad_norm": 19.511430740356445,
"learning_rate": 3.878171247357294e-06,
"loss": 0.8417,
"step": 2900
},
{
"epoch": 0.49721649361641257,
"grad_norm": 20.566545486450195,
"learning_rate": 3.87376673713883e-06,
"loss": 0.7735,
"step": 2910
},
{
"epoch": 0.49892514136079885,
"grad_norm": 19.69638442993164,
"learning_rate": 3.869362226920367e-06,
"loss": 0.8195,
"step": 2920
},
{
"epoch": 0.5006337891051852,
"grad_norm": 19.20965576171875,
"learning_rate": 3.864957716701903e-06,
"loss": 0.8407,
"step": 2930
},
{
"epoch": 0.5023424368495715,
"grad_norm": 18.67803955078125,
"learning_rate": 3.86055320648344e-06,
"loss": 0.8401,
"step": 2940
},
{
"epoch": 0.5040510845939578,
"grad_norm": 19.72920036315918,
"learning_rate": 3.8561486962649755e-06,
"loss": 0.8176,
"step": 2950
},
{
"epoch": 0.505759732338344,
"grad_norm": 27.366355895996094,
"learning_rate": 3.851744186046512e-06,
"loss": 0.8252,
"step": 2960
},
{
"epoch": 0.5074683800827303,
"grad_norm": 24.130985260009766,
"learning_rate": 3.847339675828049e-06,
"loss": 0.8809,
"step": 2970
},
{
"epoch": 0.5091770278271166,
"grad_norm": 17.88861846923828,
"learning_rate": 3.842935165609585e-06,
"loss": 0.7728,
"step": 2980
},
{
"epoch": 0.5108856755715029,
"grad_norm": 22.26430892944336,
"learning_rate": 3.8385306553911205e-06,
"loss": 0.8516,
"step": 2990
},
{
"epoch": 0.5125943233158893,
"grad_norm": 20.059843063354492,
"learning_rate": 3.834126145172657e-06,
"loss": 0.8084,
"step": 3000
},
{
"epoch": 0.5143029710602756,
"grad_norm": 22.344680786132812,
"learning_rate": 3.829721634954193e-06,
"loss": 0.7728,
"step": 3010
},
{
"epoch": 0.5160116188046618,
"grad_norm": 18.06348991394043,
"learning_rate": 3.82531712473573e-06,
"loss": 0.8463,
"step": 3020
},
{
"epoch": 0.5177202665490481,
"grad_norm": 20.816757202148438,
"learning_rate": 3.820912614517266e-06,
"loss": 0.7904,
"step": 3030
},
{
"epoch": 0.5194289142934344,
"grad_norm": 24.29160499572754,
"learning_rate": 3.816508104298802e-06,
"loss": 0.8059,
"step": 3040
},
{
"epoch": 0.5211375620378207,
"grad_norm": 21.73212242126465,
"learning_rate": 3.8121035940803385e-06,
"loss": 0.7817,
"step": 3050
},
{
"epoch": 0.522846209782207,
"grad_norm": 20.20355224609375,
"learning_rate": 3.807699083861875e-06,
"loss": 0.7836,
"step": 3060
},
{
"epoch": 0.5245548575265934,
"grad_norm": 15.811525344848633,
"learning_rate": 3.803294573643411e-06,
"loss": 0.84,
"step": 3070
},
{
"epoch": 0.5262635052709796,
"grad_norm": 23.239578247070312,
"learning_rate": 3.7988900634249478e-06,
"loss": 0.795,
"step": 3080
},
{
"epoch": 0.5279721530153659,
"grad_norm": 18.50345802307129,
"learning_rate": 3.794485553206484e-06,
"loss": 0.8536,
"step": 3090
},
{
"epoch": 0.5296808007597522,
"grad_norm": 21.647409439086914,
"learning_rate": 3.7900810429880203e-06,
"loss": 0.8141,
"step": 3100
},
{
"epoch": 0.5313894485041385,
"grad_norm": 22.411800384521484,
"learning_rate": 3.7856765327695566e-06,
"loss": 0.7931,
"step": 3110
},
{
"epoch": 0.5330980962485248,
"grad_norm": 23.15050506591797,
"learning_rate": 3.7812720225510924e-06,
"loss": 0.7902,
"step": 3120
},
{
"epoch": 0.5348067439929111,
"grad_norm": 26.446077346801758,
"learning_rate": 3.7768675123326287e-06,
"loss": 0.8206,
"step": 3130
},
{
"epoch": 0.5365153917372973,
"grad_norm": 18.19157600402832,
"learning_rate": 3.772463002114165e-06,
"loss": 0.796,
"step": 3140
},
{
"epoch": 0.5382240394816837,
"grad_norm": 28.09468650817871,
"learning_rate": 3.768058491895701e-06,
"loss": 0.7592,
"step": 3150
},
{
"epoch": 0.53993268722607,
"grad_norm": 19.753379821777344,
"learning_rate": 3.763653981677238e-06,
"loss": 0.7917,
"step": 3160
},
{
"epoch": 0.5416413349704563,
"grad_norm": 22.52701759338379,
"learning_rate": 3.759249471458774e-06,
"loss": 0.8036,
"step": 3170
},
{
"epoch": 0.5433499827148426,
"grad_norm": 24.160633087158203,
"learning_rate": 3.7548449612403104e-06,
"loss": 0.8111,
"step": 3180
},
{
"epoch": 0.5450586304592289,
"grad_norm": 20.43000030517578,
"learning_rate": 3.7504404510218467e-06,
"loss": 0.8181,
"step": 3190
},
{
"epoch": 0.5467672782036151,
"grad_norm": 20.047271728515625,
"learning_rate": 3.746035940803383e-06,
"loss": 0.8229,
"step": 3200
},
{
"epoch": 0.5484759259480014,
"grad_norm": 20.642215728759766,
"learning_rate": 3.741631430584919e-06,
"loss": 0.8322,
"step": 3210
},
{
"epoch": 0.5501845736923878,
"grad_norm": 19.686071395874023,
"learning_rate": 3.737226920366456e-06,
"loss": 0.775,
"step": 3220
},
{
"epoch": 0.5518932214367741,
"grad_norm": 17.0440616607666,
"learning_rate": 3.732822410147992e-06,
"loss": 0.8128,
"step": 3230
},
{
"epoch": 0.5536018691811604,
"grad_norm": 20.75046730041504,
"learning_rate": 3.7284178999295284e-06,
"loss": 0.8061,
"step": 3240
},
{
"epoch": 0.5553105169255467,
"grad_norm": 23.867816925048828,
"learning_rate": 3.7240133897110643e-06,
"loss": 0.7951,
"step": 3250
},
{
"epoch": 0.557019164669933,
"grad_norm": 26.70461082458496,
"learning_rate": 3.7196088794926005e-06,
"loss": 0.8343,
"step": 3260
},
{
"epoch": 0.5587278124143192,
"grad_norm": 17.15665054321289,
"learning_rate": 3.715204369274137e-06,
"loss": 0.8434,
"step": 3270
},
{
"epoch": 0.5604364601587055,
"grad_norm": 23.122482299804688,
"learning_rate": 3.710799859055673e-06,
"loss": 0.799,
"step": 3280
},
{
"epoch": 0.5621451079030919,
"grad_norm": 18.05946922302246,
"learning_rate": 3.7063953488372093e-06,
"loss": 0.8326,
"step": 3290
},
{
"epoch": 0.5638537556474782,
"grad_norm": 21.532657623291016,
"learning_rate": 3.701990838618746e-06,
"loss": 0.7646,
"step": 3300
},
{
"epoch": 0.5655624033918645,
"grad_norm": 21.229511260986328,
"learning_rate": 3.6975863284002823e-06,
"loss": 0.7565,
"step": 3310
},
{
"epoch": 0.5672710511362508,
"grad_norm": 18.513898849487305,
"learning_rate": 3.6931818181818186e-06,
"loss": 0.8021,
"step": 3320
},
{
"epoch": 0.568979698880637,
"grad_norm": 20.819110870361328,
"learning_rate": 3.688777307963355e-06,
"loss": 0.8215,
"step": 3330
},
{
"epoch": 0.5706883466250233,
"grad_norm": 27.454303741455078,
"learning_rate": 3.684372797744891e-06,
"loss": 0.7404,
"step": 3340
},
{
"epoch": 0.5723969943694096,
"grad_norm": 20.618860244750977,
"learning_rate": 3.6799682875264273e-06,
"loss": 0.7855,
"step": 3350
},
{
"epoch": 0.5741056421137959,
"grad_norm": 21.150808334350586,
"learning_rate": 3.675563777307964e-06,
"loss": 0.7411,
"step": 3360
},
{
"epoch": 0.5758142898581823,
"grad_norm": 23.632627487182617,
"learning_rate": 3.6711592670895003e-06,
"loss": 0.7503,
"step": 3370
},
{
"epoch": 0.5775229376025686,
"grad_norm": 19.350055694580078,
"learning_rate": 3.666754756871036e-06,
"loss": 0.8135,
"step": 3380
},
{
"epoch": 0.5792315853469548,
"grad_norm": 19.341176986694336,
"learning_rate": 3.6623502466525724e-06,
"loss": 0.8338,
"step": 3390
},
{
"epoch": 0.5809402330913411,
"grad_norm": 24.91313362121582,
"learning_rate": 3.6579457364341087e-06,
"loss": 0.7763,
"step": 3400
},
{
"epoch": 0.5826488808357274,
"grad_norm": 23.72249412536621,
"learning_rate": 3.653541226215645e-06,
"loss": 0.7926,
"step": 3410
},
{
"epoch": 0.5843575285801137,
"grad_norm": 22.838260650634766,
"learning_rate": 3.649136715997181e-06,
"loss": 0.8036,
"step": 3420
},
{
"epoch": 0.5860661763245,
"grad_norm": 19.691679000854492,
"learning_rate": 3.6447322057787175e-06,
"loss": 0.8427,
"step": 3430
},
{
"epoch": 0.5877748240688864,
"grad_norm": 21.973587036132812,
"learning_rate": 3.640327695560254e-06,
"loss": 0.8159,
"step": 3440
},
{
"epoch": 0.5894834718132727,
"grad_norm": 21.701208114624023,
"learning_rate": 3.6359231853417904e-06,
"loss": 0.8004,
"step": 3450
},
{
"epoch": 0.5911921195576589,
"grad_norm": 21.209928512573242,
"learning_rate": 3.6315186751233267e-06,
"loss": 0.8038,
"step": 3460
},
{
"epoch": 0.5929007673020452,
"grad_norm": 19.597747802734375,
"learning_rate": 3.627114164904863e-06,
"loss": 0.7868,
"step": 3470
},
{
"epoch": 0.5946094150464315,
"grad_norm": 18.882831573486328,
"learning_rate": 3.6227096546863992e-06,
"loss": 0.7861,
"step": 3480
},
{
"epoch": 0.5963180627908178,
"grad_norm": 18.91342544555664,
"learning_rate": 3.6183051444679355e-06,
"loss": 0.8082,
"step": 3490
},
{
"epoch": 0.5980267105352041,
"grad_norm": 23.127704620361328,
"learning_rate": 3.613900634249472e-06,
"loss": 0.8093,
"step": 3500
},
{
"epoch": 0.5997353582795903,
"grad_norm": 23.314237594604492,
"learning_rate": 3.6094961240310076e-06,
"loss": 0.8148,
"step": 3510
},
{
"epoch": 0.6014440060239767,
"grad_norm": 19.88514518737793,
"learning_rate": 3.6050916138125443e-06,
"loss": 0.7951,
"step": 3520
},
{
"epoch": 0.603152653768363,
"grad_norm": 23.107532501220703,
"learning_rate": 3.6006871035940805e-06,
"loss": 0.7852,
"step": 3530
},
{
"epoch": 0.6048613015127493,
"grad_norm": 26.108352661132812,
"learning_rate": 3.596282593375617e-06,
"loss": 0.7846,
"step": 3540
},
{
"epoch": 0.6065699492571356,
"grad_norm": 21.61062240600586,
"learning_rate": 3.591878083157153e-06,
"loss": 0.8116,
"step": 3550
},
{
"epoch": 0.6082785970015219,
"grad_norm": 22.475379943847656,
"learning_rate": 3.5874735729386893e-06,
"loss": 0.8082,
"step": 3560
},
{
"epoch": 0.6099872447459082,
"grad_norm": 20.961181640625,
"learning_rate": 3.5830690627202256e-06,
"loss": 0.7747,
"step": 3570
},
{
"epoch": 0.6116958924902944,
"grad_norm": 23.609365463256836,
"learning_rate": 3.5786645525017623e-06,
"loss": 0.828,
"step": 3580
},
{
"epoch": 0.6134045402346808,
"grad_norm": 17.144989013671875,
"learning_rate": 3.5742600422832986e-06,
"loss": 0.8089,
"step": 3590
},
{
"epoch": 0.6151131879790671,
"grad_norm": 24.28973388671875,
"learning_rate": 3.569855532064835e-06,
"loss": 0.782,
"step": 3600
},
{
"epoch": 0.6168218357234534,
"grad_norm": 21.782333374023438,
"learning_rate": 3.565451021846371e-06,
"loss": 0.8252,
"step": 3610
},
{
"epoch": 0.6185304834678397,
"grad_norm": 18.921234130859375,
"learning_rate": 3.5610465116279074e-06,
"loss": 0.7856,
"step": 3620
},
{
"epoch": 0.620239131212226,
"grad_norm": 27.037317276000977,
"learning_rate": 3.5566420014094436e-06,
"loss": 0.7732,
"step": 3630
},
{
"epoch": 0.6219477789566122,
"grad_norm": 20.37610626220703,
"learning_rate": 3.5522374911909795e-06,
"loss": 0.8081,
"step": 3640
},
{
"epoch": 0.6236564267009985,
"grad_norm": 20.596923828125,
"learning_rate": 3.547832980972516e-06,
"loss": 0.8218,
"step": 3650
},
{
"epoch": 0.6253650744453849,
"grad_norm": 19.31607437133789,
"learning_rate": 3.5434284707540524e-06,
"loss": 0.8212,
"step": 3660
},
{
"epoch": 0.6270737221897712,
"grad_norm": 25.045026779174805,
"learning_rate": 3.5390239605355887e-06,
"loss": 0.8197,
"step": 3670
},
{
"epoch": 0.6287823699341575,
"grad_norm": 26.2932071685791,
"learning_rate": 3.534619450317125e-06,
"loss": 0.8084,
"step": 3680
},
{
"epoch": 0.6304910176785438,
"grad_norm": 22.81402587890625,
"learning_rate": 3.530214940098661e-06,
"loss": 0.7689,
"step": 3690
},
{
"epoch": 0.63219966542293,
"grad_norm": 19.472158432006836,
"learning_rate": 3.5258104298801975e-06,
"loss": 0.7875,
"step": 3700
},
{
"epoch": 0.6339083131673163,
"grad_norm": 18.043285369873047,
"learning_rate": 3.5214059196617337e-06,
"loss": 0.8188,
"step": 3710
},
{
"epoch": 0.6356169609117026,
"grad_norm": 29.622112274169922,
"learning_rate": 3.5170014094432704e-06,
"loss": 0.7512,
"step": 3720
},
{
"epoch": 0.6373256086560889,
"grad_norm": 20.153039932250977,
"learning_rate": 3.5125968992248067e-06,
"loss": 0.7823,
"step": 3730
},
{
"epoch": 0.6390342564004753,
"grad_norm": 23.100482940673828,
"learning_rate": 3.508192389006343e-06,
"loss": 0.8137,
"step": 3740
},
{
"epoch": 0.6407429041448616,
"grad_norm": 23.236019134521484,
"learning_rate": 3.5037878787878792e-06,
"loss": 0.7014,
"step": 3750
},
{
"epoch": 0.6424515518892479,
"grad_norm": 22.595932006835938,
"learning_rate": 3.4993833685694155e-06,
"loss": 0.7896,
"step": 3760
},
{
"epoch": 0.6441601996336341,
"grad_norm": 24.64199447631836,
"learning_rate": 3.4949788583509513e-06,
"loss": 0.7879,
"step": 3770
},
{
"epoch": 0.6458688473780204,
"grad_norm": 17.925630569458008,
"learning_rate": 3.4905743481324876e-06,
"loss": 0.8212,
"step": 3780
},
{
"epoch": 0.6475774951224067,
"grad_norm": 27.082433700561523,
"learning_rate": 3.4861698379140243e-06,
"loss": 0.8151,
"step": 3790
},
{
"epoch": 0.649286142866793,
"grad_norm": 19.66040802001953,
"learning_rate": 3.4817653276955606e-06,
"loss": 0.7707,
"step": 3800
},
{
"epoch": 0.6509947906111794,
"grad_norm": 22.2485408782959,
"learning_rate": 3.477360817477097e-06,
"loss": 0.742,
"step": 3810
},
{
"epoch": 0.6527034383555657,
"grad_norm": 20.138118743896484,
"learning_rate": 3.472956307258633e-06,
"loss": 0.8006,
"step": 3820
},
{
"epoch": 0.6544120860999519,
"grad_norm": 21.199825286865234,
"learning_rate": 3.4685517970401693e-06,
"loss": 0.7662,
"step": 3830
},
{
"epoch": 0.6561207338443382,
"grad_norm": 24.360260009765625,
"learning_rate": 3.4641472868217056e-06,
"loss": 0.8112,
"step": 3840
},
{
"epoch": 0.6578293815887245,
"grad_norm": 16.09538459777832,
"learning_rate": 3.4597427766032423e-06,
"loss": 0.7945,
"step": 3850
},
{
"epoch": 0.6595380293331108,
"grad_norm": 22.721424102783203,
"learning_rate": 3.4553382663847786e-06,
"loss": 0.8033,
"step": 3860
},
{
"epoch": 0.6612466770774971,
"grad_norm": 24.86945343017578,
"learning_rate": 3.450933756166315e-06,
"loss": 0.7616,
"step": 3870
},
{
"epoch": 0.6629553248218834,
"grad_norm": 23.66960906982422,
"learning_rate": 3.446529245947851e-06,
"loss": 0.739,
"step": 3880
},
{
"epoch": 0.6646639725662697,
"grad_norm": 26.404010772705078,
"learning_rate": 3.4421247357293874e-06,
"loss": 0.8027,
"step": 3890
},
{
"epoch": 0.666372620310656,
"grad_norm": 17.85309410095215,
"learning_rate": 3.437720225510923e-06,
"loss": 0.8071,
"step": 3900
},
{
"epoch": 0.6680812680550423,
"grad_norm": 21.82198143005371,
"learning_rate": 3.4333157152924595e-06,
"loss": 0.8042,
"step": 3910
},
{
"epoch": 0.6697899157994286,
"grad_norm": 23.275218963623047,
"learning_rate": 3.4289112050739957e-06,
"loss": 0.7481,
"step": 3920
},
{
"epoch": 0.6714985635438149,
"grad_norm": 21.871013641357422,
"learning_rate": 3.4245066948555324e-06,
"loss": 0.7892,
"step": 3930
},
{
"epoch": 0.6732072112882012,
"grad_norm": 21.8370418548584,
"learning_rate": 3.4201021846370687e-06,
"loss": 0.7582,
"step": 3940
},
{
"epoch": 0.6749158590325874,
"grad_norm": 23.338394165039062,
"learning_rate": 3.415697674418605e-06,
"loss": 0.7742,
"step": 3950
},
{
"epoch": 0.6766245067769738,
"grad_norm": 22.160715103149414,
"learning_rate": 3.4112931642001412e-06,
"loss": 0.7382,
"step": 3960
},
{
"epoch": 0.6783331545213601,
"grad_norm": 20.671384811401367,
"learning_rate": 3.4068886539816775e-06,
"loss": 0.7889,
"step": 3970
},
{
"epoch": 0.6800418022657464,
"grad_norm": 25.99142837524414,
"learning_rate": 3.4024841437632137e-06,
"loss": 0.7906,
"step": 3980
},
{
"epoch": 0.6817504500101327,
"grad_norm": 21.951120376586914,
"learning_rate": 3.3980796335447504e-06,
"loss": 0.7836,
"step": 3990
},
{
"epoch": 0.683459097754519,
"grad_norm": 19.033308029174805,
"learning_rate": 3.3936751233262867e-06,
"loss": 0.7602,
"step": 4000
},
{
"epoch": 0.6851677454989052,
"grad_norm": 23.86874008178711,
"learning_rate": 3.389270613107823e-06,
"loss": 0.7759,
"step": 4010
},
{
"epoch": 0.6868763932432915,
"grad_norm": 19.606098175048828,
"learning_rate": 3.3848661028893592e-06,
"loss": 0.7874,
"step": 4020
},
{
"epoch": 0.6885850409876779,
"grad_norm": 20.22423553466797,
"learning_rate": 3.380461592670895e-06,
"loss": 0.825,
"step": 4030
},
{
"epoch": 0.6902936887320642,
"grad_norm": 27.637001037597656,
"learning_rate": 3.3760570824524313e-06,
"loss": 0.771,
"step": 4040
},
{
"epoch": 0.6920023364764505,
"grad_norm": 18.97125244140625,
"learning_rate": 3.3716525722339676e-06,
"loss": 0.7249,
"step": 4050
},
{
"epoch": 0.6937109842208368,
"grad_norm": 22.724328994750977,
"learning_rate": 3.367248062015504e-06,
"loss": 0.746,
"step": 4060
},
{
"epoch": 0.695419631965223,
"grad_norm": 21.274978637695312,
"learning_rate": 3.3628435517970406e-06,
"loss": 0.7504,
"step": 4070
},
{
"epoch": 0.6971282797096093,
"grad_norm": 23.363569259643555,
"learning_rate": 3.358439041578577e-06,
"loss": 0.6809,
"step": 4080
},
{
"epoch": 0.6988369274539956,
"grad_norm": 27.47598648071289,
"learning_rate": 3.354034531360113e-06,
"loss": 0.755,
"step": 4090
},
{
"epoch": 0.7005455751983819,
"grad_norm": 23.85652732849121,
"learning_rate": 3.3496300211416494e-06,
"loss": 0.7601,
"step": 4100
},
{
"epoch": 0.7022542229427683,
"grad_norm": 18.246395111083984,
"learning_rate": 3.3452255109231856e-06,
"loss": 0.7201,
"step": 4110
},
{
"epoch": 0.7039628706871546,
"grad_norm": 22.6968936920166,
"learning_rate": 3.340821000704722e-06,
"loss": 0.772,
"step": 4120
},
{
"epoch": 0.7056715184315409,
"grad_norm": 16.688634872436523,
"learning_rate": 3.3364164904862586e-06,
"loss": 0.7743,
"step": 4130
},
{
"epoch": 0.7073801661759271,
"grad_norm": 22.384685516357422,
"learning_rate": 3.332011980267795e-06,
"loss": 0.7562,
"step": 4140
},
{
"epoch": 0.7090888139203134,
"grad_norm": 25.848621368408203,
"learning_rate": 3.327607470049331e-06,
"loss": 0.778,
"step": 4150
},
{
"epoch": 0.7107974616646997,
"grad_norm": 20.71343231201172,
"learning_rate": 3.323202959830867e-06,
"loss": 0.7714,
"step": 4160
},
{
"epoch": 0.712506109409086,
"grad_norm": 25.288433074951172,
"learning_rate": 3.318798449612403e-06,
"loss": 0.7812,
"step": 4170
},
{
"epoch": 0.7142147571534724,
"grad_norm": 25.958364486694336,
"learning_rate": 3.3143939393939395e-06,
"loss": 0.8008,
"step": 4180
},
{
"epoch": 0.7159234048978587,
"grad_norm": 23.568279266357422,
"learning_rate": 3.3099894291754757e-06,
"loss": 0.7468,
"step": 4190
},
{
"epoch": 0.7176320526422449,
"grad_norm": 25.222332000732422,
"learning_rate": 3.305584918957012e-06,
"loss": 0.7379,
"step": 4200
},
{
"epoch": 0.7193407003866312,
"grad_norm": 23.69734764099121,
"learning_rate": 3.3011804087385487e-06,
"loss": 0.7478,
"step": 4210
},
{
"epoch": 0.7210493481310175,
"grad_norm": 18.56196403503418,
"learning_rate": 3.296775898520085e-06,
"loss": 0.7341,
"step": 4220
},
{
"epoch": 0.7227579958754038,
"grad_norm": 28.462255477905273,
"learning_rate": 3.2923713883016212e-06,
"loss": 0.7084,
"step": 4230
},
{
"epoch": 0.7244666436197901,
"grad_norm": 23.669126510620117,
"learning_rate": 3.2879668780831575e-06,
"loss": 0.741,
"step": 4240
},
{
"epoch": 0.7261752913641765,
"grad_norm": 22.7609920501709,
"learning_rate": 3.2835623678646938e-06,
"loss": 0.7507,
"step": 4250
},
{
"epoch": 0.7278839391085627,
"grad_norm": 21.962385177612305,
"learning_rate": 3.27915785764623e-06,
"loss": 0.7521,
"step": 4260
},
{
"epoch": 0.729592586852949,
"grad_norm": 23.406116485595703,
"learning_rate": 3.2747533474277667e-06,
"loss": 0.7374,
"step": 4270
},
{
"epoch": 0.7313012345973353,
"grad_norm": 25.467397689819336,
"learning_rate": 3.270348837209303e-06,
"loss": 0.7894,
"step": 4280
},
{
"epoch": 0.7330098823417216,
"grad_norm": 21.29004669189453,
"learning_rate": 3.265944326990839e-06,
"loss": 0.7763,
"step": 4290
},
{
"epoch": 0.7347185300861079,
"grad_norm": 26.53734588623047,
"learning_rate": 3.261539816772375e-06,
"loss": 0.7704,
"step": 4300
},
{
"epoch": 0.7364271778304942,
"grad_norm": 26.881288528442383,
"learning_rate": 3.2571353065539113e-06,
"loss": 0.7655,
"step": 4310
},
{
"epoch": 0.7381358255748804,
"grad_norm": 21.281936645507812,
"learning_rate": 3.2527307963354476e-06,
"loss": 0.7732,
"step": 4320
},
{
"epoch": 0.7398444733192668,
"grad_norm": 22.189983367919922,
"learning_rate": 3.248326286116984e-06,
"loss": 0.766,
"step": 4330
},
{
"epoch": 0.7415531210636531,
"grad_norm": 20.438308715820312,
"learning_rate": 3.24392177589852e-06,
"loss": 0.7765,
"step": 4340
},
{
"epoch": 0.7432617688080394,
"grad_norm": 23.522388458251953,
"learning_rate": 3.239517265680057e-06,
"loss": 0.7617,
"step": 4350
},
{
"epoch": 0.7449704165524257,
"grad_norm": 27.77216148376465,
"learning_rate": 3.235112755461593e-06,
"loss": 0.7321,
"step": 4360
},
{
"epoch": 0.746679064296812,
"grad_norm": 25.899330139160156,
"learning_rate": 3.2307082452431294e-06,
"loss": 0.705,
"step": 4370
},
{
"epoch": 0.7483877120411982,
"grad_norm": 24.98331069946289,
"learning_rate": 3.2263037350246656e-06,
"loss": 0.7356,
"step": 4380
},
{
"epoch": 0.7500963597855845,
"grad_norm": 22.49882698059082,
"learning_rate": 3.221899224806202e-06,
"loss": 0.751,
"step": 4390
},
{
"epoch": 0.7518050075299709,
"grad_norm": 21.93841552734375,
"learning_rate": 3.217494714587738e-06,
"loss": 0.7846,
"step": 4400
},
{
"epoch": 0.7535136552743572,
"grad_norm": 16.464521408081055,
"learning_rate": 3.213090204369275e-06,
"loss": 0.7572,
"step": 4410
},
{
"epoch": 0.7552223030187435,
"grad_norm": 22.928815841674805,
"learning_rate": 3.2086856941508103e-06,
"loss": 0.7004,
"step": 4420
},
{
"epoch": 0.7569309507631298,
"grad_norm": 28.229320526123047,
"learning_rate": 3.204281183932347e-06,
"loss": 0.7712,
"step": 4430
},
{
"epoch": 0.758639598507516,
"grad_norm": 17.726673126220703,
"learning_rate": 3.1998766737138832e-06,
"loss": 0.7637,
"step": 4440
},
{
"epoch": 0.7603482462519023,
"grad_norm": 23.8514404296875,
"learning_rate": 3.1954721634954195e-06,
"loss": 0.7547,
"step": 4450
},
{
"epoch": 0.7620568939962886,
"grad_norm": 24.728208541870117,
"learning_rate": 3.1910676532769557e-06,
"loss": 0.7626,
"step": 4460
},
{
"epoch": 0.7637655417406749,
"grad_norm": 20.695667266845703,
"learning_rate": 3.186663143058492e-06,
"loss": 0.7339,
"step": 4470
},
{
"epoch": 0.7654741894850613,
"grad_norm": 21.278423309326172,
"learning_rate": 3.1822586328400283e-06,
"loss": 0.7922,
"step": 4480
},
{
"epoch": 0.7671828372294476,
"grad_norm": 23.316635131835938,
"learning_rate": 3.177854122621565e-06,
"loss": 0.7576,
"step": 4490
},
{
"epoch": 0.7688914849738339,
"grad_norm": 24.75078010559082,
"learning_rate": 3.1734496124031012e-06,
"loss": 0.7455,
"step": 4500
},
{
"epoch": 0.7706001327182201,
"grad_norm": 23.322919845581055,
"learning_rate": 3.1690451021846375e-06,
"loss": 0.7637,
"step": 4510
},
{
"epoch": 0.7723087804626064,
"grad_norm": 26.70413589477539,
"learning_rate": 3.1646405919661738e-06,
"loss": 0.6983,
"step": 4520
},
{
"epoch": 0.7740174282069927,
"grad_norm": 21.821128845214844,
"learning_rate": 3.16023608174771e-06,
"loss": 0.7044,
"step": 4530
},
{
"epoch": 0.775726075951379,
"grad_norm": 19.717451095581055,
"learning_rate": 3.1558315715292463e-06,
"loss": 0.7294,
"step": 4540
},
{
"epoch": 0.7774347236957654,
"grad_norm": 21.586071014404297,
"learning_rate": 3.151427061310782e-06,
"loss": 0.77,
"step": 4550
},
{
"epoch": 0.7791433714401517,
"grad_norm": 25.673486709594727,
"learning_rate": 3.1470225510923184e-06,
"loss": 0.7258,
"step": 4560
},
{
"epoch": 0.780852019184538,
"grad_norm": 27.769350051879883,
"learning_rate": 3.142618040873855e-06,
"loss": 0.797,
"step": 4570
},
{
"epoch": 0.7825606669289242,
"grad_norm": 20.539966583251953,
"learning_rate": 3.1382135306553914e-06,
"loss": 0.7611,
"step": 4580
},
{
"epoch": 0.7842693146733105,
"grad_norm": 21.524412155151367,
"learning_rate": 3.1338090204369276e-06,
"loss": 0.7666,
"step": 4590
},
{
"epoch": 0.7859779624176968,
"grad_norm": 22.3591365814209,
"learning_rate": 3.129404510218464e-06,
"loss": 0.7882,
"step": 4600
},
{
"epoch": 0.7876866101620831,
"grad_norm": 23.00992202758789,
"learning_rate": 3.125e-06,
"loss": 0.7842,
"step": 4610
},
{
"epoch": 0.7893952579064695,
"grad_norm": 16.515499114990234,
"learning_rate": 3.1205954897815364e-06,
"loss": 0.7369,
"step": 4620
},
{
"epoch": 0.7911039056508558,
"grad_norm": 21.174406051635742,
"learning_rate": 3.116190979563073e-06,
"loss": 0.738,
"step": 4630
},
{
"epoch": 0.792812553395242,
"grad_norm": 23.586978912353516,
"learning_rate": 3.1117864693446094e-06,
"loss": 0.6997,
"step": 4640
},
{
"epoch": 0.7945212011396283,
"grad_norm": 32.21963882446289,
"learning_rate": 3.1073819591261456e-06,
"loss": 0.7019,
"step": 4650
},
{
"epoch": 0.7962298488840146,
"grad_norm": 25.9871883392334,
"learning_rate": 3.102977448907682e-06,
"loss": 0.7083,
"step": 4660
},
{
"epoch": 0.7979384966284009,
"grad_norm": 24.332395553588867,
"learning_rate": 3.098572938689218e-06,
"loss": 0.7863,
"step": 4670
},
{
"epoch": 0.7996471443727872,
"grad_norm": 20.869014739990234,
"learning_rate": 3.094168428470754e-06,
"loss": 0.7582,
"step": 4680
},
{
"epoch": 0.8013557921171734,
"grad_norm": 22.537940979003906,
"learning_rate": 3.0897639182522903e-06,
"loss": 0.806,
"step": 4690
},
{
"epoch": 0.8030644398615598,
"grad_norm": 26.17819595336914,
"learning_rate": 3.0853594080338265e-06,
"loss": 0.743,
"step": 4700
},
{
"epoch": 0.8047730876059461,
"grad_norm": 23.158397674560547,
"learning_rate": 3.0809548978153632e-06,
"loss": 0.7075,
"step": 4710
},
{
"epoch": 0.8064817353503324,
"grad_norm": 18.984607696533203,
"learning_rate": 3.0765503875968995e-06,
"loss": 0.7483,
"step": 4720
},
{
"epoch": 0.8081903830947187,
"grad_norm": 22.03697967529297,
"learning_rate": 3.0721458773784358e-06,
"loss": 0.7295,
"step": 4730
},
{
"epoch": 0.809899030839105,
"grad_norm": 19.310800552368164,
"learning_rate": 3.067741367159972e-06,
"loss": 0.7566,
"step": 4740
},
{
"epoch": 0.8116076785834913,
"grad_norm": 27.38188934326172,
"learning_rate": 3.0633368569415083e-06,
"loss": 0.7487,
"step": 4750
},
{
"epoch": 0.8133163263278775,
"grad_norm": 30.696491241455078,
"learning_rate": 3.0589323467230446e-06,
"loss": 0.7468,
"step": 4760
},
{
"epoch": 0.8150249740722639,
"grad_norm": 25.93939208984375,
"learning_rate": 3.0545278365045812e-06,
"loss": 0.7608,
"step": 4770
},
{
"epoch": 0.8167336218166502,
"grad_norm": 24.9782772064209,
"learning_rate": 3.0501233262861175e-06,
"loss": 0.7327,
"step": 4780
},
{
"epoch": 0.8184422695610365,
"grad_norm": 19.76726531982422,
"learning_rate": 3.0457188160676538e-06,
"loss": 0.7585,
"step": 4790
},
{
"epoch": 0.8201509173054228,
"grad_norm": 24.16695785522461,
"learning_rate": 3.04131430584919e-06,
"loss": 0.7812,
"step": 4800
},
{
"epoch": 0.8218595650498091,
"grad_norm": 25.34935188293457,
"learning_rate": 3.036909795630726e-06,
"loss": 0.718,
"step": 4810
},
{
"epoch": 0.8235682127941953,
"grad_norm": 21.88555335998535,
"learning_rate": 3.032505285412262e-06,
"loss": 0.7264,
"step": 4820
},
{
"epoch": 0.8252768605385816,
"grad_norm": 28.31941795349121,
"learning_rate": 3.0281007751937984e-06,
"loss": 0.7012,
"step": 4830
},
{
"epoch": 0.8269855082829679,
"grad_norm": 24.483379364013672,
"learning_rate": 3.0236962649753347e-06,
"loss": 0.6828,
"step": 4840
},
{
"epoch": 0.8286941560273543,
"grad_norm": 22.461471557617188,
"learning_rate": 3.0192917547568714e-06,
"loss": 0.7266,
"step": 4850
},
{
"epoch": 0.8304028037717406,
"grad_norm": 19.53203773498535,
"learning_rate": 3.0148872445384076e-06,
"loss": 0.707,
"step": 4860
},
{
"epoch": 0.8321114515161269,
"grad_norm": 26.503684997558594,
"learning_rate": 3.010482734319944e-06,
"loss": 0.7399,
"step": 4870
},
{
"epoch": 0.8338200992605131,
"grad_norm": 25.25548553466797,
"learning_rate": 3.00607822410148e-06,
"loss": 0.7094,
"step": 4880
},
{
"epoch": 0.8355287470048994,
"grad_norm": 27.260940551757812,
"learning_rate": 3.0016737138830164e-06,
"loss": 0.7311,
"step": 4890
},
{
"epoch": 0.8372373947492857,
"grad_norm": 22.992063522338867,
"learning_rate": 2.9972692036645527e-06,
"loss": 0.7389,
"step": 4900
},
{
"epoch": 0.838946042493672,
"grad_norm": 24.592796325683594,
"learning_rate": 2.9928646934460894e-06,
"loss": 0.753,
"step": 4910
},
{
"epoch": 0.8406546902380584,
"grad_norm": 21.972124099731445,
"learning_rate": 2.9884601832276256e-06,
"loss": 0.7347,
"step": 4920
},
{
"epoch": 0.8423633379824447,
"grad_norm": 21.52046775817871,
"learning_rate": 2.984055673009162e-06,
"loss": 0.6925,
"step": 4930
},
{
"epoch": 0.844071985726831,
"grad_norm": 26.47010040283203,
"learning_rate": 2.9796511627906977e-06,
"loss": 0.6887,
"step": 4940
},
{
"epoch": 0.8457806334712172,
"grad_norm": 31.673635482788086,
"learning_rate": 2.975246652572234e-06,
"loss": 0.7223,
"step": 4950
},
{
"epoch": 0.8474892812156035,
"grad_norm": 24.043643951416016,
"learning_rate": 2.9708421423537703e-06,
"loss": 0.7438,
"step": 4960
},
{
"epoch": 0.8491979289599898,
"grad_norm": 21.57198715209961,
"learning_rate": 2.9664376321353065e-06,
"loss": 0.7187,
"step": 4970
},
{
"epoch": 0.8509065767043761,
"grad_norm": 27.323469161987305,
"learning_rate": 2.962033121916843e-06,
"loss": 0.7423,
"step": 4980
},
{
"epoch": 0.8526152244487625,
"grad_norm": 29.45259666442871,
"learning_rate": 2.9576286116983795e-06,
"loss": 0.7215,
"step": 4990
},
{
"epoch": 0.8543238721931488,
"grad_norm": 25.042516708374023,
"learning_rate": 2.9532241014799158e-06,
"loss": 0.7226,
"step": 5000
},
{
"epoch": 0.856032519937535,
"grad_norm": 20.377517700195312,
"learning_rate": 2.948819591261452e-06,
"loss": 0.7774,
"step": 5010
},
{
"epoch": 0.8577411676819213,
"grad_norm": 19.54035758972168,
"learning_rate": 2.9444150810429883e-06,
"loss": 0.7994,
"step": 5020
},
{
"epoch": 0.8594498154263076,
"grad_norm": 23.07032012939453,
"learning_rate": 2.9400105708245246e-06,
"loss": 0.7022,
"step": 5030
},
{
"epoch": 0.8611584631706939,
"grad_norm": 23.482563018798828,
"learning_rate": 2.935606060606061e-06,
"loss": 0.7228,
"step": 5040
},
{
"epoch": 0.8628671109150802,
"grad_norm": 20.6116886138916,
"learning_rate": 2.9312015503875975e-06,
"loss": 0.6769,
"step": 5050
},
{
"epoch": 0.8645757586594665,
"grad_norm": 23.133941650390625,
"learning_rate": 2.9267970401691338e-06,
"loss": 0.7216,
"step": 5060
},
{
"epoch": 0.8662844064038528,
"grad_norm": 19.501455307006836,
"learning_rate": 2.9223925299506696e-06,
"loss": 0.7417,
"step": 5070
},
{
"epoch": 0.8679930541482391,
"grad_norm": 20.669921875,
"learning_rate": 2.917988019732206e-06,
"loss": 0.7187,
"step": 5080
},
{
"epoch": 0.8697017018926254,
"grad_norm": 24.454565048217773,
"learning_rate": 2.913583509513742e-06,
"loss": 0.6937,
"step": 5090
},
{
"epoch": 0.8714103496370117,
"grad_norm": 26.569597244262695,
"learning_rate": 2.9091789992952784e-06,
"loss": 0.7769,
"step": 5100
},
{
"epoch": 0.873118997381398,
"grad_norm": 23.066076278686523,
"learning_rate": 2.9047744890768147e-06,
"loss": 0.7607,
"step": 5110
},
{
"epoch": 0.8748276451257843,
"grad_norm": 25.31006622314453,
"learning_rate": 2.900369978858351e-06,
"loss": 0.692,
"step": 5120
},
{
"epoch": 0.8765362928701705,
"grad_norm": 24.027446746826172,
"learning_rate": 2.8959654686398876e-06,
"loss": 0.6777,
"step": 5130
},
{
"epoch": 0.8782449406145569,
"grad_norm": 21.320232391357422,
"learning_rate": 2.891560958421424e-06,
"loss": 0.7671,
"step": 5140
},
{
"epoch": 0.8799535883589432,
"grad_norm": 22.37028694152832,
"learning_rate": 2.88715644820296e-06,
"loss": 0.7451,
"step": 5150
},
{
"epoch": 0.8816622361033295,
"grad_norm": 17.270870208740234,
"learning_rate": 2.8827519379844964e-06,
"loss": 0.7202,
"step": 5160
},
{
"epoch": 0.8833708838477158,
"grad_norm": 32.7978401184082,
"learning_rate": 2.8783474277660327e-06,
"loss": 0.6955,
"step": 5170
},
{
"epoch": 0.8850795315921021,
"grad_norm": 30.631633758544922,
"learning_rate": 2.873942917547569e-06,
"loss": 0.7657,
"step": 5180
},
{
"epoch": 0.8867881793364883,
"grad_norm": 21.487262725830078,
"learning_rate": 2.8695384073291056e-06,
"loss": 0.723,
"step": 5190
},
{
"epoch": 0.8884968270808746,
"grad_norm": 25.697208404541016,
"learning_rate": 2.865133897110641e-06,
"loss": 0.7678,
"step": 5200
},
{
"epoch": 0.8902054748252609,
"grad_norm": 17.121862411499023,
"learning_rate": 2.8607293868921778e-06,
"loss": 0.736,
"step": 5210
},
{
"epoch": 0.8919141225696473,
"grad_norm": 22.777664184570312,
"learning_rate": 2.856324876673714e-06,
"loss": 0.6902,
"step": 5220
},
{
"epoch": 0.8936227703140336,
"grad_norm": 21.997692108154297,
"learning_rate": 2.8519203664552503e-06,
"loss": 0.7177,
"step": 5230
},
{
"epoch": 0.8953314180584199,
"grad_norm": 27.831954956054688,
"learning_rate": 2.8475158562367866e-06,
"loss": 0.7061,
"step": 5240
},
{
"epoch": 0.8970400658028062,
"grad_norm": 20.20841407775879,
"learning_rate": 2.843111346018323e-06,
"loss": 0.7103,
"step": 5250
},
{
"epoch": 0.8987487135471924,
"grad_norm": 22.670791625976562,
"learning_rate": 2.838706835799859e-06,
"loss": 0.6827,
"step": 5260
},
{
"epoch": 0.9004573612915787,
"grad_norm": 19.863046646118164,
"learning_rate": 2.8343023255813958e-06,
"loss": 0.7659,
"step": 5270
},
{
"epoch": 0.902166009035965,
"grad_norm": 21.82082176208496,
"learning_rate": 2.829897815362932e-06,
"loss": 0.6768,
"step": 5280
},
{
"epoch": 0.9038746567803514,
"grad_norm": 20.27167320251465,
"learning_rate": 2.8254933051444683e-06,
"loss": 0.7313,
"step": 5290
},
{
"epoch": 0.9055833045247377,
"grad_norm": 23.12434196472168,
"learning_rate": 2.8210887949260046e-06,
"loss": 0.7167,
"step": 5300
},
{
"epoch": 0.907291952269124,
"grad_norm": 28.580188751220703,
"learning_rate": 2.816684284707541e-06,
"loss": 0.707,
"step": 5310
},
{
"epoch": 0.9090006000135102,
"grad_norm": 21.65957260131836,
"learning_rate": 2.812279774489077e-06,
"loss": 0.7243,
"step": 5320
},
{
"epoch": 0.9107092477578965,
"grad_norm": 18.040910720825195,
"learning_rate": 2.807875264270613e-06,
"loss": 0.714,
"step": 5330
},
{
"epoch": 0.9124178955022828,
"grad_norm": 25.710729598999023,
"learning_rate": 2.803470754052149e-06,
"loss": 0.7092,
"step": 5340
},
{
"epoch": 0.9141265432466691,
"grad_norm": 23.372407913208008,
"learning_rate": 2.799066243833686e-06,
"loss": 0.6511,
"step": 5350
},
{
"epoch": 0.9158351909910555,
"grad_norm": 24.783931732177734,
"learning_rate": 2.794661733615222e-06,
"loss": 0.6906,
"step": 5360
},
{
"epoch": 0.9175438387354418,
"grad_norm": 25.27882194519043,
"learning_rate": 2.7902572233967584e-06,
"loss": 0.686,
"step": 5370
},
{
"epoch": 0.919252486479828,
"grad_norm": 22.388492584228516,
"learning_rate": 2.7858527131782947e-06,
"loss": 0.6987,
"step": 5380
},
{
"epoch": 0.9209611342242143,
"grad_norm": 20.66554832458496,
"learning_rate": 2.781448202959831e-06,
"loss": 0.6719,
"step": 5390
},
{
"epoch": 0.9226697819686006,
"grad_norm": 21.613603591918945,
"learning_rate": 2.7770436927413672e-06,
"loss": 0.7096,
"step": 5400
},
{
"epoch": 0.9243784297129869,
"grad_norm": 22.45414161682129,
"learning_rate": 2.772639182522904e-06,
"loss": 0.6965,
"step": 5410
},
{
"epoch": 0.9260870774573732,
"grad_norm": 22.07771110534668,
"learning_rate": 2.76823467230444e-06,
"loss": 0.6987,
"step": 5420
},
{
"epoch": 0.9277957252017595,
"grad_norm": 22.09317970275879,
"learning_rate": 2.7638301620859764e-06,
"loss": 0.7033,
"step": 5430
},
{
"epoch": 0.9295043729461459,
"grad_norm": 22.29112434387207,
"learning_rate": 2.7594256518675127e-06,
"loss": 0.7126,
"step": 5440
},
{
"epoch": 0.9312130206905321,
"grad_norm": 25.35603141784668,
"learning_rate": 2.755021141649049e-06,
"loss": 0.6872,
"step": 5450
},
{
"epoch": 0.9329216684349184,
"grad_norm": 26.43601417541504,
"learning_rate": 2.750616631430585e-06,
"loss": 0.6884,
"step": 5460
},
{
"epoch": 0.9346303161793047,
"grad_norm": 22.09392738342285,
"learning_rate": 2.746212121212121e-06,
"loss": 0.7547,
"step": 5470
},
{
"epoch": 0.936338963923691,
"grad_norm": 18.14749526977539,
"learning_rate": 2.7418076109936578e-06,
"loss": 0.7237,
"step": 5480
},
{
"epoch": 0.9380476116680773,
"grad_norm": 25.575664520263672,
"learning_rate": 2.737403100775194e-06,
"loss": 0.7274,
"step": 5490
},
{
"epoch": 0.9397562594124635,
"grad_norm": 30.890182495117188,
"learning_rate": 2.7329985905567303e-06,
"loss": 0.657,
"step": 5500
},
{
"epoch": 0.9414649071568499,
"grad_norm": 25.72110939025879,
"learning_rate": 2.7285940803382666e-06,
"loss": 0.6839,
"step": 5510
},
{
"epoch": 0.9431735549012362,
"grad_norm": 19.578693389892578,
"learning_rate": 2.724189570119803e-06,
"loss": 0.7292,
"step": 5520
},
{
"epoch": 0.9448822026456225,
"grad_norm": 19.946809768676758,
"learning_rate": 2.719785059901339e-06,
"loss": 0.7023,
"step": 5530
},
{
"epoch": 0.9465908503900088,
"grad_norm": 29.049034118652344,
"learning_rate": 2.7153805496828758e-06,
"loss": 0.6711,
"step": 5540
},
{
"epoch": 0.9482994981343951,
"grad_norm": 26.28841781616211,
"learning_rate": 2.710976039464412e-06,
"loss": 0.6878,
"step": 5550
},
{
"epoch": 0.9500081458787814,
"grad_norm": 28.58267593383789,
"learning_rate": 2.7065715292459483e-06,
"loss": 0.6849,
"step": 5560
},
{
"epoch": 0.9517167936231676,
"grad_norm": 23.82330894470215,
"learning_rate": 2.7021670190274846e-06,
"loss": 0.6599,
"step": 5570
},
{
"epoch": 0.9534254413675539,
"grad_norm": 23.920379638671875,
"learning_rate": 2.697762508809021e-06,
"loss": 0.7053,
"step": 5580
},
{
"epoch": 0.9551340891119403,
"grad_norm": 21.284543991088867,
"learning_rate": 2.6933579985905567e-06,
"loss": 0.6852,
"step": 5590
},
{
"epoch": 0.9568427368563266,
"grad_norm": 25.53280258178711,
"learning_rate": 2.688953488372093e-06,
"loss": 0.7453,
"step": 5600
},
{
"epoch": 0.9585513846007129,
"grad_norm": 25.06231689453125,
"learning_rate": 2.684548978153629e-06,
"loss": 0.7138,
"step": 5610
},
{
"epoch": 0.9602600323450992,
"grad_norm": 23.394676208496094,
"learning_rate": 2.680144467935166e-06,
"loss": 0.7542,
"step": 5620
},
{
"epoch": 0.9619686800894854,
"grad_norm": 18.197351455688477,
"learning_rate": 2.675739957716702e-06,
"loss": 0.6856,
"step": 5630
},
{
"epoch": 0.9636773278338717,
"grad_norm": 20.14853286743164,
"learning_rate": 2.6713354474982384e-06,
"loss": 0.7383,
"step": 5640
},
{
"epoch": 0.965385975578258,
"grad_norm": 19.874074935913086,
"learning_rate": 2.6669309372797747e-06,
"loss": 0.7068,
"step": 5650
},
{
"epoch": 0.9670946233226444,
"grad_norm": 21.235719680786133,
"learning_rate": 2.662526427061311e-06,
"loss": 0.7039,
"step": 5660
},
{
"epoch": 0.9688032710670307,
"grad_norm": 24.528348922729492,
"learning_rate": 2.6581219168428472e-06,
"loss": 0.7245,
"step": 5670
},
{
"epoch": 0.970511918811417,
"grad_norm": 23.650028228759766,
"learning_rate": 2.653717406624384e-06,
"loss": 0.6733,
"step": 5680
},
{
"epoch": 0.9722205665558032,
"grad_norm": 24.60836410522461,
"learning_rate": 2.64931289640592e-06,
"loss": 0.6852,
"step": 5690
},
{
"epoch": 0.9739292143001895,
"grad_norm": 21.282657623291016,
"learning_rate": 2.6449083861874564e-06,
"loss": 0.6296,
"step": 5700
},
{
"epoch": 0.9756378620445758,
"grad_norm": 25.645389556884766,
"learning_rate": 2.6405038759689927e-06,
"loss": 0.716,
"step": 5710
},
{
"epoch": 0.9773465097889621,
"grad_norm": 27.467487335205078,
"learning_rate": 2.6360993657505286e-06,
"loss": 0.6401,
"step": 5720
},
{
"epoch": 0.9790551575333485,
"grad_norm": 24.290742874145508,
"learning_rate": 2.631694855532065e-06,
"loss": 0.7524,
"step": 5730
},
{
"epoch": 0.9807638052777348,
"grad_norm": 25.512126922607422,
"learning_rate": 2.627290345313601e-06,
"loss": 0.7001,
"step": 5740
},
{
"epoch": 0.982472453022121,
"grad_norm": 25.331077575683594,
"learning_rate": 2.6228858350951373e-06,
"loss": 0.7483,
"step": 5750
},
{
"epoch": 0.9841811007665073,
"grad_norm": 27.04343032836914,
"learning_rate": 2.618481324876674e-06,
"loss": 0.6901,
"step": 5760
},
{
"epoch": 0.9858897485108936,
"grad_norm": 22.927780151367188,
"learning_rate": 2.6140768146582103e-06,
"loss": 0.6261,
"step": 5770
},
{
"epoch": 0.9875983962552799,
"grad_norm": 19.579212188720703,
"learning_rate": 2.6096723044397466e-06,
"loss": 0.6957,
"step": 5780
},
{
"epoch": 0.9893070439996662,
"grad_norm": 29.58092498779297,
"learning_rate": 2.605267794221283e-06,
"loss": 0.7329,
"step": 5790
},
{
"epoch": 0.9910156917440525,
"grad_norm": 17.51485252380371,
"learning_rate": 2.600863284002819e-06,
"loss": 0.6811,
"step": 5800
},
{
"epoch": 0.9927243394884389,
"grad_norm": 24.039936065673828,
"learning_rate": 2.5964587737843554e-06,
"loss": 0.7713,
"step": 5810
},
{
"epoch": 0.9944329872328251,
"grad_norm": 21.120576858520508,
"learning_rate": 2.592054263565892e-06,
"loss": 0.675,
"step": 5820
},
{
"epoch": 0.9961416349772114,
"grad_norm": 21.085262298583984,
"learning_rate": 2.5876497533474283e-06,
"loss": 0.6972,
"step": 5830
},
{
"epoch": 0.9978502827215977,
"grad_norm": 23.86699104309082,
"learning_rate": 2.5832452431289646e-06,
"loss": 0.7248,
"step": 5840
},
{
"epoch": 0.999558930465984,
"grad_norm": 22.0477237701416,
"learning_rate": 2.5788407329105004e-06,
"loss": 0.6543,
"step": 5850
},
{
"epoch": 0.9999006600148612,
"eval_loss": 0.848216712474823,
"eval_runtime": 139.044,
"eval_samples_per_second": 70.906,
"eval_steps_per_second": 8.868,
"step": 5852
},
{
"epoch": 1.001366918195509,
"grad_norm": 17.0991268157959,
"learning_rate": 2.5744362226920367e-06,
"loss": 0.6552,
"step": 5860
},
{
"epoch": 1.0030755659398953,
"grad_norm": 25.986722946166992,
"learning_rate": 2.570031712473573e-06,
"loss": 0.4587,
"step": 5870
},
{
"epoch": 1.0047842136842817,
"grad_norm": 30.915128707885742,
"learning_rate": 2.5656272022551092e-06,
"loss": 0.429,
"step": 5880
},
{
"epoch": 1.006492861428668,
"grad_norm": 25.264280319213867,
"learning_rate": 2.5612226920366455e-06,
"loss": 0.4217,
"step": 5890
},
{
"epoch": 1.0082015091730543,
"grad_norm": 30.28125,
"learning_rate": 2.556818181818182e-06,
"loss": 0.4479,
"step": 5900
},
{
"epoch": 1.0099101569174405,
"grad_norm": 22.753318786621094,
"learning_rate": 2.5524136715997184e-06,
"loss": 0.4314,
"step": 5910
},
{
"epoch": 1.0116188046618269,
"grad_norm": 31.079761505126953,
"learning_rate": 2.5480091613812547e-06,
"loss": 0.4476,
"step": 5920
},
{
"epoch": 1.013327452406213,
"grad_norm": 23.833829879760742,
"learning_rate": 2.543604651162791e-06,
"loss": 0.4658,
"step": 5930
},
{
"epoch": 1.0150361001505994,
"grad_norm": 29.113054275512695,
"learning_rate": 2.5392001409443272e-06,
"loss": 0.4135,
"step": 5940
},
{
"epoch": 1.0167447478949858,
"grad_norm": 33.021793365478516,
"learning_rate": 2.5347956307258635e-06,
"loss": 0.455,
"step": 5950
},
{
"epoch": 1.018453395639372,
"grad_norm": 25.65928840637207,
"learning_rate": 2.5303911205074e-06,
"loss": 0.4224,
"step": 5960
},
{
"epoch": 1.0201620433837584,
"grad_norm": 29.965845108032227,
"learning_rate": 2.5259866102889365e-06,
"loss": 0.4035,
"step": 5970
},
{
"epoch": 1.0218706911281445,
"grad_norm": 27.758283615112305,
"learning_rate": 2.5215821000704723e-06,
"loss": 0.4181,
"step": 5980
},
{
"epoch": 1.023579338872531,
"grad_norm": 27.05050277709961,
"learning_rate": 2.5171775898520086e-06,
"loss": 0.4385,
"step": 5990
},
{
"epoch": 1.0252879866169171,
"grad_norm": 32.60606002807617,
"learning_rate": 2.512773079633545e-06,
"loss": 0.4366,
"step": 6000
},
{
"epoch": 1.0269966343613035,
"grad_norm": 26.773223876953125,
"learning_rate": 2.508368569415081e-06,
"loss": 0.3887,
"step": 6010
},
{
"epoch": 1.02870528210569,
"grad_norm": 28.4731502532959,
"learning_rate": 2.5039640591966174e-06,
"loss": 0.4236,
"step": 6020
},
{
"epoch": 1.030413929850076,
"grad_norm": 27.751144409179688,
"learning_rate": 2.4995595489781536e-06,
"loss": 0.4405,
"step": 6030
},
{
"epoch": 1.0321225775944625,
"grad_norm": 30.297574996948242,
"learning_rate": 2.4951550387596903e-06,
"loss": 0.4587,
"step": 6040
},
{
"epoch": 1.0338312253388486,
"grad_norm": 27.601472854614258,
"learning_rate": 2.4907505285412266e-06,
"loss": 0.4233,
"step": 6050
},
{
"epoch": 1.035539873083235,
"grad_norm": 28.992273330688477,
"learning_rate": 2.486346018322763e-06,
"loss": 0.3768,
"step": 6060
},
{
"epoch": 1.0372485208276212,
"grad_norm": 30.652511596679688,
"learning_rate": 2.481941508104299e-06,
"loss": 0.4453,
"step": 6070
},
{
"epoch": 1.0389571685720076,
"grad_norm": 27.534353256225586,
"learning_rate": 2.4775369978858354e-06,
"loss": 0.4561,
"step": 6080
},
{
"epoch": 1.0406658163163938,
"grad_norm": 33.124420166015625,
"learning_rate": 2.4731324876673716e-06,
"loss": 0.4058,
"step": 6090
},
{
"epoch": 1.0423744640607802,
"grad_norm": 27.169292449951172,
"learning_rate": 2.468727977448908e-06,
"loss": 0.3672,
"step": 6100
},
{
"epoch": 1.0440831118051666,
"grad_norm": 34.9072151184082,
"learning_rate": 2.464323467230444e-06,
"loss": 0.3951,
"step": 6110
},
{
"epoch": 1.0457917595495527,
"grad_norm": 19.93570899963379,
"learning_rate": 2.4599189570119804e-06,
"loss": 0.4027,
"step": 6120
},
{
"epoch": 1.0475004072939391,
"grad_norm": 20.85097312927246,
"learning_rate": 2.4555144467935167e-06,
"loss": 0.3867,
"step": 6130
},
{
"epoch": 1.0492090550383253,
"grad_norm": 39.88778305053711,
"learning_rate": 2.451109936575053e-06,
"loss": 0.4293,
"step": 6140
},
{
"epoch": 1.0509177027827117,
"grad_norm": 24.23454475402832,
"learning_rate": 2.4467054263565892e-06,
"loss": 0.4324,
"step": 6150
},
{
"epoch": 1.0526263505270979,
"grad_norm": 26.025646209716797,
"learning_rate": 2.4423009161381255e-06,
"loss": 0.4314,
"step": 6160
},
{
"epoch": 1.0543349982714842,
"grad_norm": 32.79511260986328,
"learning_rate": 2.4378964059196618e-06,
"loss": 0.3892,
"step": 6170
},
{
"epoch": 1.0560436460158706,
"grad_norm": 29.5976505279541,
"learning_rate": 2.4334918957011984e-06,
"loss": 0.4593,
"step": 6180
},
{
"epoch": 1.0577522937602568,
"grad_norm": 31.08228874206543,
"learning_rate": 2.4290873854827347e-06,
"loss": 0.3868,
"step": 6190
},
{
"epoch": 1.0594609415046432,
"grad_norm": 32.12119674682617,
"learning_rate": 2.424682875264271e-06,
"loss": 0.4333,
"step": 6200
},
{
"epoch": 1.0611695892490294,
"grad_norm": 41.33872985839844,
"learning_rate": 2.420278365045807e-06,
"loss": 0.4146,
"step": 6210
},
{
"epoch": 1.0628782369934158,
"grad_norm": 41.04495620727539,
"learning_rate": 2.4158738548273435e-06,
"loss": 0.465,
"step": 6220
},
{
"epoch": 1.064586884737802,
"grad_norm": 36.077674865722656,
"learning_rate": 2.4114693446088798e-06,
"loss": 0.4483,
"step": 6230
},
{
"epoch": 1.0662955324821883,
"grad_norm": 27.00971221923828,
"learning_rate": 2.407064834390416e-06,
"loss": 0.4006,
"step": 6240
},
{
"epoch": 1.0680041802265747,
"grad_norm": 26.599790573120117,
"learning_rate": 2.4026603241719523e-06,
"loss": 0.4588,
"step": 6250
},
{
"epoch": 1.069712827970961,
"grad_norm": 32.89334487915039,
"learning_rate": 2.3982558139534886e-06,
"loss": 0.4048,
"step": 6260
},
{
"epoch": 1.0714214757153473,
"grad_norm": 28.60340118408203,
"learning_rate": 2.393851303735025e-06,
"loss": 0.4211,
"step": 6270
},
{
"epoch": 1.0731301234597335,
"grad_norm": 33.43773651123047,
"learning_rate": 2.389446793516561e-06,
"loss": 0.449,
"step": 6280
},
{
"epoch": 1.0748387712041199,
"grad_norm": 24.94864273071289,
"learning_rate": 2.3850422832980974e-06,
"loss": 0.4117,
"step": 6290
},
{
"epoch": 1.076547418948506,
"grad_norm": 40.33943557739258,
"learning_rate": 2.3806377730796336e-06,
"loss": 0.399,
"step": 6300
},
{
"epoch": 1.0782560666928924,
"grad_norm": 21.64677619934082,
"learning_rate": 2.37623326286117e-06,
"loss": 0.3691,
"step": 6310
},
{
"epoch": 1.0799647144372788,
"grad_norm": 24.09543800354004,
"learning_rate": 2.3718287526427066e-06,
"loss": 0.4372,
"step": 6320
},
{
"epoch": 1.081673362181665,
"grad_norm": 38.64820861816406,
"learning_rate": 2.367424242424243e-06,
"loss": 0.4074,
"step": 6330
},
{
"epoch": 1.0833820099260514,
"grad_norm": 25.985990524291992,
"learning_rate": 2.3630197322057787e-06,
"loss": 0.4339,
"step": 6340
},
{
"epoch": 1.0850906576704376,
"grad_norm": 24.800357818603516,
"learning_rate": 2.358615221987315e-06,
"loss": 0.4702,
"step": 6350
},
{
"epoch": 1.086799305414824,
"grad_norm": 21.988859176635742,
"learning_rate": 2.3542107117688516e-06,
"loss": 0.4824,
"step": 6360
},
{
"epoch": 1.0885079531592101,
"grad_norm": 39.72243881225586,
"learning_rate": 2.349806201550388e-06,
"loss": 0.4188,
"step": 6370
},
{
"epoch": 1.0902166009035965,
"grad_norm": 23.119239807128906,
"learning_rate": 2.345401691331924e-06,
"loss": 0.4573,
"step": 6380
},
{
"epoch": 1.091925248647983,
"grad_norm": 20.915830612182617,
"learning_rate": 2.3409971811134604e-06,
"loss": 0.4256,
"step": 6390
},
{
"epoch": 1.093633896392369,
"grad_norm": 25.44793701171875,
"learning_rate": 2.3365926708949967e-06,
"loss": 0.416,
"step": 6400
},
{
"epoch": 1.0953425441367555,
"grad_norm": 34.23642349243164,
"learning_rate": 2.332188160676533e-06,
"loss": 0.3993,
"step": 6410
},
{
"epoch": 1.0970511918811416,
"grad_norm": 26.873048782348633,
"learning_rate": 2.3277836504580692e-06,
"loss": 0.4516,
"step": 6420
},
{
"epoch": 1.098759839625528,
"grad_norm": 29.566207885742188,
"learning_rate": 2.3233791402396055e-06,
"loss": 0.4385,
"step": 6430
},
{
"epoch": 1.1004684873699142,
"grad_norm": 38.95769500732422,
"learning_rate": 2.3189746300211418e-06,
"loss": 0.4334,
"step": 6440
},
{
"epoch": 1.1021771351143006,
"grad_norm": 22.23900604248047,
"learning_rate": 2.314570119802678e-06,
"loss": 0.4133,
"step": 6450
},
{
"epoch": 1.1038857828586868,
"grad_norm": 32.352516174316406,
"learning_rate": 2.3101656095842147e-06,
"loss": 0.4445,
"step": 6460
},
{
"epoch": 1.1055944306030732,
"grad_norm": 27.49093246459961,
"learning_rate": 2.3057610993657506e-06,
"loss": 0.458,
"step": 6470
},
{
"epoch": 1.1073030783474596,
"grad_norm": 22.2708740234375,
"learning_rate": 2.301356589147287e-06,
"loss": 0.4362,
"step": 6480
},
{
"epoch": 1.1090117260918457,
"grad_norm": 28.47286605834961,
"learning_rate": 2.296952078928823e-06,
"loss": 0.4389,
"step": 6490
},
{
"epoch": 1.1107203738362321,
"grad_norm": 33.60470199584961,
"learning_rate": 2.2925475687103598e-06,
"loss": 0.434,
"step": 6500
},
{
"epoch": 1.1124290215806183,
"grad_norm": 25.99380874633789,
"learning_rate": 2.288143058491896e-06,
"loss": 0.4128,
"step": 6510
},
{
"epoch": 1.1141376693250047,
"grad_norm": 23.311767578125,
"learning_rate": 2.2837385482734323e-06,
"loss": 0.3983,
"step": 6520
},
{
"epoch": 1.115846317069391,
"grad_norm": 42.49270248413086,
"learning_rate": 2.2793340380549686e-06,
"loss": 0.4046,
"step": 6530
},
{
"epoch": 1.1175549648137773,
"grad_norm": 34.39870834350586,
"learning_rate": 2.274929527836505e-06,
"loss": 0.4225,
"step": 6540
},
{
"epoch": 1.1192636125581636,
"grad_norm": 58.568817138671875,
"learning_rate": 2.270525017618041e-06,
"loss": 0.403,
"step": 6550
},
{
"epoch": 1.1209722603025498,
"grad_norm": 40.097511291503906,
"learning_rate": 2.2661205073995774e-06,
"loss": 0.4099,
"step": 6560
},
{
"epoch": 1.1226809080469362,
"grad_norm": 22.363500595092773,
"learning_rate": 2.2617159971811136e-06,
"loss": 0.4422,
"step": 6570
},
{
"epoch": 1.1243895557913224,
"grad_norm": 29.2000732421875,
"learning_rate": 2.25731148696265e-06,
"loss": 0.4302,
"step": 6580
},
{
"epoch": 1.1260982035357088,
"grad_norm": 25.729015350341797,
"learning_rate": 2.252906976744186e-06,
"loss": 0.4165,
"step": 6590
},
{
"epoch": 1.127806851280095,
"grad_norm": 41.546085357666016,
"learning_rate": 2.2485024665257224e-06,
"loss": 0.4259,
"step": 6600
},
{
"epoch": 1.1295154990244813,
"grad_norm": 27.8181095123291,
"learning_rate": 2.2440979563072587e-06,
"loss": 0.4418,
"step": 6610
},
{
"epoch": 1.1312241467688677,
"grad_norm": 27.532690048217773,
"learning_rate": 2.239693446088795e-06,
"loss": 0.4072,
"step": 6620
},
{
"epoch": 1.132932794513254,
"grad_norm": 38.94101333618164,
"learning_rate": 2.2352889358703312e-06,
"loss": 0.4003,
"step": 6630
},
{
"epoch": 1.1346414422576403,
"grad_norm": 28.348133087158203,
"learning_rate": 2.230884425651868e-06,
"loss": 0.3975,
"step": 6640
},
{
"epoch": 1.1363500900020265,
"grad_norm": 37.84804916381836,
"learning_rate": 2.226479915433404e-06,
"loss": 0.4254,
"step": 6650
},
{
"epoch": 1.1380587377464129,
"grad_norm": 26.082874298095703,
"learning_rate": 2.2220754052149404e-06,
"loss": 0.3791,
"step": 6660
},
{
"epoch": 1.139767385490799,
"grad_norm": 28.30021095275879,
"learning_rate": 2.2176708949964763e-06,
"loss": 0.4773,
"step": 6670
},
{
"epoch": 1.1414760332351854,
"grad_norm": 23.014328002929688,
"learning_rate": 2.213266384778013e-06,
"loss": 0.4389,
"step": 6680
},
{
"epoch": 1.1431846809795718,
"grad_norm": 22.675397872924805,
"learning_rate": 2.2088618745595492e-06,
"loss": 0.4072,
"step": 6690
},
{
"epoch": 1.144893328723958,
"grad_norm": 37.76887893676758,
"learning_rate": 2.2044573643410855e-06,
"loss": 0.4555,
"step": 6700
},
{
"epoch": 1.1466019764683444,
"grad_norm": 25.542505264282227,
"learning_rate": 2.2000528541226218e-06,
"loss": 0.421,
"step": 6710
},
{
"epoch": 1.1483106242127306,
"grad_norm": 26.092363357543945,
"learning_rate": 2.195648343904158e-06,
"loss": 0.433,
"step": 6720
},
{
"epoch": 1.150019271957117,
"grad_norm": 38.503875732421875,
"learning_rate": 2.1912438336856943e-06,
"loss": 0.4213,
"step": 6730
},
{
"epoch": 1.1517279197015031,
"grad_norm": 29.79505157470703,
"learning_rate": 2.1868393234672306e-06,
"loss": 0.3697,
"step": 6740
},
{
"epoch": 1.1534365674458895,
"grad_norm": 31.861398696899414,
"learning_rate": 2.182434813248767e-06,
"loss": 0.4401,
"step": 6750
},
{
"epoch": 1.155145215190276,
"grad_norm": 25.554759979248047,
"learning_rate": 2.178030303030303e-06,
"loss": 0.444,
"step": 6760
},
{
"epoch": 1.156853862934662,
"grad_norm": 25.79574966430664,
"learning_rate": 2.1736257928118394e-06,
"loss": 0.4126,
"step": 6770
},
{
"epoch": 1.1585625106790485,
"grad_norm": 30.038307189941406,
"learning_rate": 2.169221282593376e-06,
"loss": 0.4217,
"step": 6780
},
{
"epoch": 1.1602711584234346,
"grad_norm": 29.3577823638916,
"learning_rate": 2.1648167723749123e-06,
"loss": 0.4702,
"step": 6790
},
{
"epoch": 1.161979806167821,
"grad_norm": 28.816720962524414,
"learning_rate": 2.160412262156448e-06,
"loss": 0.4295,
"step": 6800
},
{
"epoch": 1.1636884539122072,
"grad_norm": 27.419452667236328,
"learning_rate": 2.1560077519379844e-06,
"loss": 0.4234,
"step": 6810
},
{
"epoch": 1.1653971016565936,
"grad_norm": 26.20050048828125,
"learning_rate": 2.151603241719521e-06,
"loss": 0.387,
"step": 6820
},
{
"epoch": 1.1671057494009798,
"grad_norm": 25.682668685913086,
"learning_rate": 2.1471987315010574e-06,
"loss": 0.3946,
"step": 6830
},
{
"epoch": 1.1688143971453662,
"grad_norm": 28.867799758911133,
"learning_rate": 2.1427942212825936e-06,
"loss": 0.4315,
"step": 6840
},
{
"epoch": 1.1705230448897526,
"grad_norm": 27.76809310913086,
"learning_rate": 2.13838971106413e-06,
"loss": 0.4153,
"step": 6850
},
{
"epoch": 1.1722316926341387,
"grad_norm": 55.45150375366211,
"learning_rate": 2.133985200845666e-06,
"loss": 0.4121,
"step": 6860
},
{
"epoch": 1.1739403403785251,
"grad_norm": 35.20660400390625,
"learning_rate": 2.1295806906272024e-06,
"loss": 0.4489,
"step": 6870
},
{
"epoch": 1.1756489881229113,
"grad_norm": 31.571155548095703,
"learning_rate": 2.1251761804087387e-06,
"loss": 0.4098,
"step": 6880
},
{
"epoch": 1.1773576358672977,
"grad_norm": 39.88226318359375,
"learning_rate": 2.120771670190275e-06,
"loss": 0.3931,
"step": 6890
},
{
"epoch": 1.179066283611684,
"grad_norm": 36.098209381103516,
"learning_rate": 2.1163671599718112e-06,
"loss": 0.3719,
"step": 6900
},
{
"epoch": 1.1807749313560703,
"grad_norm": 27.275989532470703,
"learning_rate": 2.1119626497533475e-06,
"loss": 0.4229,
"step": 6910
},
{
"epoch": 1.1824835791004566,
"grad_norm": 33.59117126464844,
"learning_rate": 2.107558139534884e-06,
"loss": 0.4148,
"step": 6920
},
{
"epoch": 1.1841922268448428,
"grad_norm": 34.12028884887695,
"learning_rate": 2.10315362931642e-06,
"loss": 0.4032,
"step": 6930
},
{
"epoch": 1.1859008745892292,
"grad_norm": 30.586210250854492,
"learning_rate": 2.0987491190979563e-06,
"loss": 0.3997,
"step": 6940
},
{
"epoch": 1.1876095223336154,
"grad_norm": 37.81381607055664,
"learning_rate": 2.0943446088794926e-06,
"loss": 0.355,
"step": 6950
},
{
"epoch": 1.1893181700780018,
"grad_norm": 24.5543270111084,
"learning_rate": 2.0899400986610292e-06,
"loss": 0.4053,
"step": 6960
},
{
"epoch": 1.191026817822388,
"grad_norm": 30.73529052734375,
"learning_rate": 2.0855355884425655e-06,
"loss": 0.366,
"step": 6970
},
{
"epoch": 1.1927354655667743,
"grad_norm": 39.325965881347656,
"learning_rate": 2.0811310782241018e-06,
"loss": 0.4275,
"step": 6980
},
{
"epoch": 1.1944441133111607,
"grad_norm": 23.889474868774414,
"learning_rate": 2.0767265680056376e-06,
"loss": 0.3998,
"step": 6990
},
{
"epoch": 1.196152761055547,
"grad_norm": 40.00243377685547,
"learning_rate": 2.0723220577871743e-06,
"loss": 0.4105,
"step": 7000
},
{
"epoch": 1.1978614087999333,
"grad_norm": 29.528654098510742,
"learning_rate": 2.0679175475687106e-06,
"loss": 0.3773,
"step": 7010
},
{
"epoch": 1.1995700565443195,
"grad_norm": 36.32196044921875,
"learning_rate": 2.063513037350247e-06,
"loss": 0.3841,
"step": 7020
},
{
"epoch": 1.2012787042887059,
"grad_norm": 27.748289108276367,
"learning_rate": 2.059108527131783e-06,
"loss": 0.4538,
"step": 7030
},
{
"epoch": 1.202987352033092,
"grad_norm": 28.619266510009766,
"learning_rate": 2.0547040169133194e-06,
"loss": 0.4644,
"step": 7040
},
{
"epoch": 1.2046959997774784,
"grad_norm": 35.11726379394531,
"learning_rate": 2.0502995066948556e-06,
"loss": 0.4483,
"step": 7050
},
{
"epoch": 1.2064046475218648,
"grad_norm": 26.959434509277344,
"learning_rate": 2.045894996476392e-06,
"loss": 0.4232,
"step": 7060
},
{
"epoch": 1.208113295266251,
"grad_norm": 28.872108459472656,
"learning_rate": 2.041490486257928e-06,
"loss": 0.4432,
"step": 7070
},
{
"epoch": 1.2098219430106374,
"grad_norm": 28.600481033325195,
"learning_rate": 2.0370859760394644e-06,
"loss": 0.4602,
"step": 7080
},
{
"epoch": 1.2115305907550236,
"grad_norm": 29.9169864654541,
"learning_rate": 2.0326814658210007e-06,
"loss": 0.4027,
"step": 7090
},
{
"epoch": 1.21323923849941,
"grad_norm": 25.34281349182129,
"learning_rate": 2.0282769556025374e-06,
"loss": 0.3764,
"step": 7100
},
{
"epoch": 1.2149478862437961,
"grad_norm": 26.075227737426758,
"learning_rate": 2.0238724453840736e-06,
"loss": 0.4072,
"step": 7110
},
{
"epoch": 1.2166565339881825,
"grad_norm": 36.243865966796875,
"learning_rate": 2.0194679351656095e-06,
"loss": 0.4358,
"step": 7120
},
{
"epoch": 1.218365181732569,
"grad_norm": 30.791261672973633,
"learning_rate": 2.0150634249471458e-06,
"loss": 0.4268,
"step": 7130
},
{
"epoch": 1.220073829476955,
"grad_norm": 31.967105865478516,
"learning_rate": 2.0106589147286824e-06,
"loss": 0.4535,
"step": 7140
},
{
"epoch": 1.2217824772213415,
"grad_norm": 22.780460357666016,
"learning_rate": 2.0062544045102187e-06,
"loss": 0.3439,
"step": 7150
},
{
"epoch": 1.2234911249657276,
"grad_norm": 24.721939086914062,
"learning_rate": 2.001849894291755e-06,
"loss": 0.3946,
"step": 7160
},
{
"epoch": 1.225199772710114,
"grad_norm": 31.781126022338867,
"learning_rate": 1.9974453840732912e-06,
"loss": 0.4221,
"step": 7170
},
{
"epoch": 1.2269084204545002,
"grad_norm": 39.08473587036133,
"learning_rate": 1.9930408738548275e-06,
"loss": 0.4108,
"step": 7180
},
{
"epoch": 1.2286170681988866,
"grad_norm": 32.67459487915039,
"learning_rate": 1.9886363636363638e-06,
"loss": 0.4214,
"step": 7190
},
{
"epoch": 1.2303257159432728,
"grad_norm": 36.37043762207031,
"learning_rate": 1.9842318534179e-06,
"loss": 0.3654,
"step": 7200
},
{
"epoch": 1.2320343636876592,
"grad_norm": 30.632551193237305,
"learning_rate": 1.9798273431994363e-06,
"loss": 0.3435,
"step": 7210
},
{
"epoch": 1.2337430114320456,
"grad_norm": 27.24967384338379,
"learning_rate": 1.9754228329809726e-06,
"loss": 0.3853,
"step": 7220
},
{
"epoch": 1.2354516591764317,
"grad_norm": 34.78539276123047,
"learning_rate": 1.971018322762509e-06,
"loss": 0.4485,
"step": 7230
},
{
"epoch": 1.2371603069208181,
"grad_norm": 30.56952476501465,
"learning_rate": 1.9666138125440455e-06,
"loss": 0.4489,
"step": 7240
},
{
"epoch": 1.2388689546652043,
"grad_norm": 25.958833694458008,
"learning_rate": 1.9622093023255814e-06,
"loss": 0.3734,
"step": 7250
},
{
"epoch": 1.2405776024095907,
"grad_norm": 21.95493507385254,
"learning_rate": 1.9578047921071176e-06,
"loss": 0.3972,
"step": 7260
},
{
"epoch": 1.242286250153977,
"grad_norm": 30.268014907836914,
"learning_rate": 1.9534002818886543e-06,
"loss": 0.3786,
"step": 7270
},
{
"epoch": 1.2439948978983633,
"grad_norm": 38.55772399902344,
"learning_rate": 1.9489957716701906e-06,
"loss": 0.4305,
"step": 7280
},
{
"epoch": 1.2457035456427497,
"grad_norm": 34.68306350708008,
"learning_rate": 1.944591261451727e-06,
"loss": 0.4009,
"step": 7290
},
{
"epoch": 1.2474121933871358,
"grad_norm": 31.926652908325195,
"learning_rate": 1.940186751233263e-06,
"loss": 0.3752,
"step": 7300
},
{
"epoch": 1.2491208411315222,
"grad_norm": 25.892805099487305,
"learning_rate": 1.9357822410147994e-06,
"loss": 0.3809,
"step": 7310
},
{
"epoch": 1.2508294888759084,
"grad_norm": 34.08556365966797,
"learning_rate": 1.9313777307963356e-06,
"loss": 0.4777,
"step": 7320
},
{
"epoch": 1.2525381366202948,
"grad_norm": 22.77074432373047,
"learning_rate": 1.926973220577872e-06,
"loss": 0.4021,
"step": 7330
},
{
"epoch": 1.254246784364681,
"grad_norm": 40.69630432128906,
"learning_rate": 1.922568710359408e-06,
"loss": 0.4507,
"step": 7340
},
{
"epoch": 1.2559554321090673,
"grad_norm": 26.732057571411133,
"learning_rate": 1.9181642001409444e-06,
"loss": 0.3927,
"step": 7350
},
{
"epoch": 1.2576640798534537,
"grad_norm": 27.998336791992188,
"learning_rate": 1.9137596899224807e-06,
"loss": 0.4073,
"step": 7360
},
{
"epoch": 1.25937272759784,
"grad_norm": 29.810136795043945,
"learning_rate": 1.9093551797040174e-06,
"loss": 0.4041,
"step": 7370
},
{
"epoch": 1.2610813753422263,
"grad_norm": 26.727005004882812,
"learning_rate": 1.9049506694855532e-06,
"loss": 0.3886,
"step": 7380
},
{
"epoch": 1.2627900230866125,
"grad_norm": 36.07413101196289,
"learning_rate": 1.9005461592670895e-06,
"loss": 0.3573,
"step": 7390
},
{
"epoch": 1.2644986708309989,
"grad_norm": 32.144283294677734,
"learning_rate": 1.896141649048626e-06,
"loss": 0.4074,
"step": 7400
},
{
"epoch": 1.2662073185753853,
"grad_norm": 24.47068977355957,
"learning_rate": 1.8917371388301622e-06,
"loss": 0.3934,
"step": 7410
},
{
"epoch": 1.2679159663197714,
"grad_norm": 29.83626365661621,
"learning_rate": 1.8873326286116985e-06,
"loss": 0.4235,
"step": 7420
},
{
"epoch": 1.2696246140641576,
"grad_norm": 27.749542236328125,
"learning_rate": 1.882928118393235e-06,
"loss": 0.3754,
"step": 7430
},
{
"epoch": 1.271333261808544,
"grad_norm": 25.998891830444336,
"learning_rate": 1.8785236081747712e-06,
"loss": 0.3815,
"step": 7440
},
{
"epoch": 1.2730419095529304,
"grad_norm": 32.17466735839844,
"learning_rate": 1.8741190979563073e-06,
"loss": 0.4338,
"step": 7450
},
{
"epoch": 1.2747505572973166,
"grad_norm": 28.768695831298828,
"learning_rate": 1.8697145877378436e-06,
"loss": 0.4261,
"step": 7460
},
{
"epoch": 1.276459205041703,
"grad_norm": 29.64584732055664,
"learning_rate": 1.86531007751938e-06,
"loss": 0.3678,
"step": 7470
},
{
"epoch": 1.2781678527860891,
"grad_norm": 32.0334587097168,
"learning_rate": 1.8609055673009163e-06,
"loss": 0.3834,
"step": 7480
},
{
"epoch": 1.2798765005304755,
"grad_norm": 33.2336540222168,
"learning_rate": 1.8565010570824526e-06,
"loss": 0.4092,
"step": 7490
},
{
"epoch": 1.281585148274862,
"grad_norm": 27.663143157958984,
"learning_rate": 1.852096546863989e-06,
"loss": 0.453,
"step": 7500
},
{
"epoch": 1.283293796019248,
"grad_norm": 26.34569549560547,
"learning_rate": 1.847692036645525e-06,
"loss": 0.3915,
"step": 7510
},
{
"epoch": 1.2850024437636345,
"grad_norm": 30.302072525024414,
"learning_rate": 1.8432875264270614e-06,
"loss": 0.4404,
"step": 7520
},
{
"epoch": 1.2867110915080207,
"grad_norm": 29.25191879272461,
"learning_rate": 1.8388830162085976e-06,
"loss": 0.3705,
"step": 7530
},
{
"epoch": 1.288419739252407,
"grad_norm": 25.125303268432617,
"learning_rate": 1.8344785059901341e-06,
"loss": 0.4077,
"step": 7540
},
{
"epoch": 1.2901283869967934,
"grad_norm": 36.632869720458984,
"learning_rate": 1.8300739957716704e-06,
"loss": 0.4101,
"step": 7550
},
{
"epoch": 1.2918370347411796,
"grad_norm": 34.67438507080078,
"learning_rate": 1.8256694855532066e-06,
"loss": 0.4094,
"step": 7560
},
{
"epoch": 1.2935456824855658,
"grad_norm": 38.920654296875,
"learning_rate": 1.8212649753347431e-06,
"loss": 0.4094,
"step": 7570
},
{
"epoch": 1.2952543302299522,
"grad_norm": 27.154075622558594,
"learning_rate": 1.8168604651162792e-06,
"loss": 0.3877,
"step": 7580
},
{
"epoch": 1.2969629779743386,
"grad_norm": 28.526582717895508,
"learning_rate": 1.8124559548978154e-06,
"loss": 0.4418,
"step": 7590
},
{
"epoch": 1.2986716257187247,
"grad_norm": 23.52269172668457,
"learning_rate": 1.8080514446793517e-06,
"loss": 0.3878,
"step": 7600
},
{
"epoch": 1.3003802734631111,
"grad_norm": 24.462650299072266,
"learning_rate": 1.8036469344608882e-06,
"loss": 0.3884,
"step": 7610
},
{
"epoch": 1.3020889212074973,
"grad_norm": 28.307111740112305,
"learning_rate": 1.7992424242424244e-06,
"loss": 0.3681,
"step": 7620
},
{
"epoch": 1.3037975689518837,
"grad_norm": 27.19947624206543,
"learning_rate": 1.7948379140239607e-06,
"loss": 0.3905,
"step": 7630
},
{
"epoch": 1.30550621669627,
"grad_norm": 40.48936080932617,
"learning_rate": 1.7904334038054968e-06,
"loss": 0.414,
"step": 7640
},
{
"epoch": 1.3072148644406563,
"grad_norm": 30.751718521118164,
"learning_rate": 1.7860288935870332e-06,
"loss": 0.4488,
"step": 7650
},
{
"epoch": 1.3089235121850427,
"grad_norm": 32.26466369628906,
"learning_rate": 1.7816243833685695e-06,
"loss": 0.4015,
"step": 7660
},
{
"epoch": 1.3106321599294288,
"grad_norm": 32.198055267333984,
"learning_rate": 1.7772198731501058e-06,
"loss": 0.3984,
"step": 7670
},
{
"epoch": 1.3123408076738152,
"grad_norm": 26.396881103515625,
"learning_rate": 1.7728153629316422e-06,
"loss": 0.3793,
"step": 7680
},
{
"epoch": 1.3140494554182014,
"grad_norm": 37.478797912597656,
"learning_rate": 1.7684108527131785e-06,
"loss": 0.3862,
"step": 7690
},
{
"epoch": 1.3157581031625878,
"grad_norm": 40.08991622924805,
"learning_rate": 1.7640063424947148e-06,
"loss": 0.4314,
"step": 7700
},
{
"epoch": 1.317466750906974,
"grad_norm": 33.823116302490234,
"learning_rate": 1.7596018322762508e-06,
"loss": 0.3761,
"step": 7710
},
{
"epoch": 1.3191753986513604,
"grad_norm": 30.793943405151367,
"learning_rate": 1.7551973220577873e-06,
"loss": 0.4022,
"step": 7720
},
{
"epoch": 1.3208840463957467,
"grad_norm": 29.607755661010742,
"learning_rate": 1.7507928118393236e-06,
"loss": 0.4276,
"step": 7730
},
{
"epoch": 1.322592694140133,
"grad_norm": 36.47589111328125,
"learning_rate": 1.7463883016208598e-06,
"loss": 0.3849,
"step": 7740
},
{
"epoch": 1.3243013418845193,
"grad_norm": 29.826234817504883,
"learning_rate": 1.7419837914023963e-06,
"loss": 0.4171,
"step": 7750
},
{
"epoch": 1.3260099896289055,
"grad_norm": 30.34208106994629,
"learning_rate": 1.7375792811839326e-06,
"loss": 0.4239,
"step": 7760
},
{
"epoch": 1.3277186373732919,
"grad_norm": 32.37610626220703,
"learning_rate": 1.7331747709654686e-06,
"loss": 0.4417,
"step": 7770
},
{
"epoch": 1.3294272851176783,
"grad_norm": 29.77751922607422,
"learning_rate": 1.728770260747005e-06,
"loss": 0.3529,
"step": 7780
},
{
"epoch": 1.3311359328620644,
"grad_norm": 27.710689544677734,
"learning_rate": 1.7243657505285414e-06,
"loss": 0.396,
"step": 7790
},
{
"epoch": 1.3328445806064506,
"grad_norm": 35.94044876098633,
"learning_rate": 1.7199612403100776e-06,
"loss": 0.4285,
"step": 7800
},
{
"epoch": 1.334553228350837,
"grad_norm": 31.398242950439453,
"learning_rate": 1.715556730091614e-06,
"loss": 0.3621,
"step": 7810
},
{
"epoch": 1.3362618760952234,
"grad_norm": 22.245264053344727,
"learning_rate": 1.7111522198731504e-06,
"loss": 0.4047,
"step": 7820
},
{
"epoch": 1.3379705238396096,
"grad_norm": 25.29467010498047,
"learning_rate": 1.7067477096546866e-06,
"loss": 0.4099,
"step": 7830
},
{
"epoch": 1.339679171583996,
"grad_norm": 28.86480140686035,
"learning_rate": 1.7023431994362227e-06,
"loss": 0.39,
"step": 7840
},
{
"epoch": 1.3413878193283821,
"grad_norm": 32.23060607910156,
"learning_rate": 1.697938689217759e-06,
"loss": 0.4299,
"step": 7850
},
{
"epoch": 1.3430964670727685,
"grad_norm": 37.11185836791992,
"learning_rate": 1.6935341789992954e-06,
"loss": 0.3409,
"step": 7860
},
{
"epoch": 1.344805114817155,
"grad_norm": 31.354124069213867,
"learning_rate": 1.6891296687808317e-06,
"loss": 0.4066,
"step": 7870
},
{
"epoch": 1.346513762561541,
"grad_norm": 29.411638259887695,
"learning_rate": 1.684725158562368e-06,
"loss": 0.4163,
"step": 7880
},
{
"epoch": 1.3482224103059275,
"grad_norm": 29.95796775817871,
"learning_rate": 1.6803206483439045e-06,
"loss": 0.413,
"step": 7890
},
{
"epoch": 1.3499310580503137,
"grad_norm": 26.26283073425293,
"learning_rate": 1.6759161381254405e-06,
"loss": 0.4122,
"step": 7900
},
{
"epoch": 1.3516397057947,
"grad_norm": 23.130903244018555,
"learning_rate": 1.6715116279069768e-06,
"loss": 0.414,
"step": 7910
},
{
"epoch": 1.3533483535390864,
"grad_norm": 33.57529830932617,
"learning_rate": 1.667107117688513e-06,
"loss": 0.4446,
"step": 7920
},
{
"epoch": 1.3550570012834726,
"grad_norm": 27.545856475830078,
"learning_rate": 1.6627026074700495e-06,
"loss": 0.3671,
"step": 7930
},
{
"epoch": 1.3567656490278588,
"grad_norm": 28.595279693603516,
"learning_rate": 1.6582980972515858e-06,
"loss": 0.3938,
"step": 7940
},
{
"epoch": 1.3584742967722452,
"grad_norm": 34.10601806640625,
"learning_rate": 1.653893587033122e-06,
"loss": 0.4092,
"step": 7950
},
{
"epoch": 1.3601829445166316,
"grad_norm": 36.68281936645508,
"learning_rate": 1.6494890768146585e-06,
"loss": 0.4359,
"step": 7960
},
{
"epoch": 1.3618915922610177,
"grad_norm": 36.802757263183594,
"learning_rate": 1.6450845665961946e-06,
"loss": 0.3926,
"step": 7970
},
{
"epoch": 1.3636002400054041,
"grad_norm": 38.538978576660156,
"learning_rate": 1.6406800563777308e-06,
"loss": 0.4068,
"step": 7980
},
{
"epoch": 1.3653088877497903,
"grad_norm": 24.058565139770508,
"learning_rate": 1.636275546159267e-06,
"loss": 0.3744,
"step": 7990
},
{
"epoch": 1.3670175354941767,
"grad_norm": 25.09589385986328,
"learning_rate": 1.6318710359408036e-06,
"loss": 0.4155,
"step": 8000
},
{
"epoch": 1.368726183238563,
"grad_norm": 35.97821044921875,
"learning_rate": 1.6274665257223398e-06,
"loss": 0.4474,
"step": 8010
},
{
"epoch": 1.3704348309829493,
"grad_norm": 30.579835891723633,
"learning_rate": 1.6230620155038761e-06,
"loss": 0.3728,
"step": 8020
},
{
"epoch": 1.3721434787273357,
"grad_norm": 24.492128372192383,
"learning_rate": 1.6186575052854122e-06,
"loss": 0.4528,
"step": 8030
},
{
"epoch": 1.3738521264717218,
"grad_norm": 29.143388748168945,
"learning_rate": 1.6142529950669486e-06,
"loss": 0.3942,
"step": 8040
},
{
"epoch": 1.3755607742161082,
"grad_norm": 32.478759765625,
"learning_rate": 1.609848484848485e-06,
"loss": 0.3717,
"step": 8050
},
{
"epoch": 1.3772694219604944,
"grad_norm": 30.298538208007812,
"learning_rate": 1.6054439746300212e-06,
"loss": 0.4021,
"step": 8060
},
{
"epoch": 1.3789780697048808,
"grad_norm": 30.066699981689453,
"learning_rate": 1.6010394644115576e-06,
"loss": 0.3804,
"step": 8070
},
{
"epoch": 1.380686717449267,
"grad_norm": 35.945133209228516,
"learning_rate": 1.596634954193094e-06,
"loss": 0.4372,
"step": 8080
},
{
"epoch": 1.3823953651936534,
"grad_norm": 21.04485321044922,
"learning_rate": 1.5922304439746302e-06,
"loss": 0.3724,
"step": 8090
},
{
"epoch": 1.3841040129380398,
"grad_norm": 27.374027252197266,
"learning_rate": 1.5878259337561662e-06,
"loss": 0.3829,
"step": 8100
},
{
"epoch": 1.385812660682426,
"grad_norm": 27.289045333862305,
"learning_rate": 1.5834214235377027e-06,
"loss": 0.3279,
"step": 8110
},
{
"epoch": 1.3875213084268123,
"grad_norm": 19.139402389526367,
"learning_rate": 1.579016913319239e-06,
"loss": 0.3983,
"step": 8120
},
{
"epoch": 1.3892299561711985,
"grad_norm": 31.3995418548584,
"learning_rate": 1.5746124031007752e-06,
"loss": 0.4445,
"step": 8130
},
{
"epoch": 1.3909386039155849,
"grad_norm": 23.96241569519043,
"learning_rate": 1.5702078928823117e-06,
"loss": 0.4473,
"step": 8140
},
{
"epoch": 1.3926472516599713,
"grad_norm": 37.16488265991211,
"learning_rate": 1.565803382663848e-06,
"loss": 0.3865,
"step": 8150
},
{
"epoch": 1.3943558994043574,
"grad_norm": 31.697296142578125,
"learning_rate": 1.561398872445384e-06,
"loss": 0.371,
"step": 8160
},
{
"epoch": 1.3960645471487436,
"grad_norm": 24.636869430541992,
"learning_rate": 1.5569943622269205e-06,
"loss": 0.3392,
"step": 8170
},
{
"epoch": 1.39777319489313,
"grad_norm": 36.1915168762207,
"learning_rate": 1.5525898520084568e-06,
"loss": 0.3992,
"step": 8180
},
{
"epoch": 1.3994818426375164,
"grad_norm": 25.10267448425293,
"learning_rate": 1.548185341789993e-06,
"loss": 0.4454,
"step": 8190
},
{
"epoch": 1.4011904903819026,
"grad_norm": 27.928958892822266,
"learning_rate": 1.5437808315715295e-06,
"loss": 0.3802,
"step": 8200
},
{
"epoch": 1.402899138126289,
"grad_norm": 27.088727951049805,
"learning_rate": 1.5393763213530658e-06,
"loss": 0.3792,
"step": 8210
},
{
"epoch": 1.4046077858706751,
"grad_norm": 28.89666175842285,
"learning_rate": 1.534971811134602e-06,
"loss": 0.3851,
"step": 8220
},
{
"epoch": 1.4063164336150615,
"grad_norm": 35.841854095458984,
"learning_rate": 1.530567300916138e-06,
"loss": 0.4547,
"step": 8230
},
{
"epoch": 1.408025081359448,
"grad_norm": 32.671783447265625,
"learning_rate": 1.5261627906976746e-06,
"loss": 0.3867,
"step": 8240
},
{
"epoch": 1.409733729103834,
"grad_norm": 26.516185760498047,
"learning_rate": 1.5217582804792108e-06,
"loss": 0.4012,
"step": 8250
},
{
"epoch": 1.4114423768482205,
"grad_norm": 37.364967346191406,
"learning_rate": 1.5173537702607471e-06,
"loss": 0.365,
"step": 8260
},
{
"epoch": 1.4131510245926067,
"grad_norm": 27.502492904663086,
"learning_rate": 1.5129492600422836e-06,
"loss": 0.3835,
"step": 8270
},
{
"epoch": 1.414859672336993,
"grad_norm": 30.40472412109375,
"learning_rate": 1.5085447498238199e-06,
"loss": 0.3815,
"step": 8280
},
{
"epoch": 1.4165683200813795,
"grad_norm": 24.262475967407227,
"learning_rate": 1.504140239605356e-06,
"loss": 0.4278,
"step": 8290
},
{
"epoch": 1.4182769678257656,
"grad_norm": 31.887592315673828,
"learning_rate": 1.4997357293868922e-06,
"loss": 0.4523,
"step": 8300
},
{
"epoch": 1.4199856155701518,
"grad_norm": 25.971759796142578,
"learning_rate": 1.4953312191684286e-06,
"loss": 0.4228,
"step": 8310
},
{
"epoch": 1.4216942633145382,
"grad_norm": 24.0732364654541,
"learning_rate": 1.490926708949965e-06,
"loss": 0.4284,
"step": 8320
},
{
"epoch": 1.4234029110589246,
"grad_norm": 35.71511459350586,
"learning_rate": 1.4865221987315012e-06,
"loss": 0.3756,
"step": 8330
},
{
"epoch": 1.4251115588033108,
"grad_norm": 25.345888137817383,
"learning_rate": 1.4821176885130377e-06,
"loss": 0.4182,
"step": 8340
},
{
"epoch": 1.4268202065476971,
"grad_norm": 31.115188598632812,
"learning_rate": 1.477713178294574e-06,
"loss": 0.4363,
"step": 8350
},
{
"epoch": 1.4285288542920833,
"grad_norm": 35.88517761230469,
"learning_rate": 1.47330866807611e-06,
"loss": 0.4125,
"step": 8360
},
{
"epoch": 1.4302375020364697,
"grad_norm": 30.74094581604004,
"learning_rate": 1.4689041578576462e-06,
"loss": 0.3921,
"step": 8370
},
{
"epoch": 1.431946149780856,
"grad_norm": 30.39889144897461,
"learning_rate": 1.4644996476391827e-06,
"loss": 0.4258,
"step": 8380
},
{
"epoch": 1.4336547975252423,
"grad_norm": 30.968448638916016,
"learning_rate": 1.460095137420719e-06,
"loss": 0.3653,
"step": 8390
},
{
"epoch": 1.4353634452696287,
"grad_norm": 29.428611755371094,
"learning_rate": 1.4556906272022552e-06,
"loss": 0.402,
"step": 8400
},
{
"epoch": 1.4370720930140148,
"grad_norm": 29.114940643310547,
"learning_rate": 1.4512861169837917e-06,
"loss": 0.3934,
"step": 8410
},
{
"epoch": 1.4387807407584012,
"grad_norm": 32.88404083251953,
"learning_rate": 1.4468816067653278e-06,
"loss": 0.401,
"step": 8420
},
{
"epoch": 1.4404893885027874,
"grad_norm": 32.356021881103516,
"learning_rate": 1.442477096546864e-06,
"loss": 0.3543,
"step": 8430
},
{
"epoch": 1.4421980362471738,
"grad_norm": 33.27191925048828,
"learning_rate": 1.4380725863284003e-06,
"loss": 0.4492,
"step": 8440
},
{
"epoch": 1.44390668399156,
"grad_norm": 33.288536071777344,
"learning_rate": 1.4336680761099368e-06,
"loss": 0.3954,
"step": 8450
},
{
"epoch": 1.4456153317359464,
"grad_norm": 30.489593505859375,
"learning_rate": 1.429263565891473e-06,
"loss": 0.4248,
"step": 8460
},
{
"epoch": 1.4473239794803328,
"grad_norm": 38.16218566894531,
"learning_rate": 1.4248590556730093e-06,
"loss": 0.4624,
"step": 8470
},
{
"epoch": 1.449032627224719,
"grad_norm": 25.624847412109375,
"learning_rate": 1.4204545454545458e-06,
"loss": 0.4115,
"step": 8480
},
{
"epoch": 1.4507412749691053,
"grad_norm": 34.9322395324707,
"learning_rate": 1.4160500352360818e-06,
"loss": 0.4179,
"step": 8490
},
{
"epoch": 1.4524499227134915,
"grad_norm": 31.277803421020508,
"learning_rate": 1.4116455250176181e-06,
"loss": 0.3662,
"step": 8500
},
{
"epoch": 1.4541585704578779,
"grad_norm": 32.513633728027344,
"learning_rate": 1.4072410147991544e-06,
"loss": 0.4179,
"step": 8510
},
{
"epoch": 1.4558672182022643,
"grad_norm": 31.79774284362793,
"learning_rate": 1.4028365045806909e-06,
"loss": 0.4221,
"step": 8520
},
{
"epoch": 1.4575758659466505,
"grad_norm": 35.4056282043457,
"learning_rate": 1.3984319943622271e-06,
"loss": 0.4233,
"step": 8530
},
{
"epoch": 1.4592845136910366,
"grad_norm": 32.08757019042969,
"learning_rate": 1.3940274841437634e-06,
"loss": 0.4619,
"step": 8540
},
{
"epoch": 1.460993161435423,
"grad_norm": 31.21336555480957,
"learning_rate": 1.3896229739252994e-06,
"loss": 0.3375,
"step": 8550
},
{
"epoch": 1.4627018091798094,
"grad_norm": 31.44502067565918,
"learning_rate": 1.385218463706836e-06,
"loss": 0.4121,
"step": 8560
},
{
"epoch": 1.4644104569241956,
"grad_norm": 21.62190818786621,
"learning_rate": 1.3808139534883722e-06,
"loss": 0.3727,
"step": 8570
},
{
"epoch": 1.466119104668582,
"grad_norm": 33.74460983276367,
"learning_rate": 1.3764094432699084e-06,
"loss": 0.426,
"step": 8580
},
{
"epoch": 1.4678277524129681,
"grad_norm": 22.65791130065918,
"learning_rate": 1.372004933051445e-06,
"loss": 0.3669,
"step": 8590
},
{
"epoch": 1.4695364001573545,
"grad_norm": 38.821624755859375,
"learning_rate": 1.3676004228329812e-06,
"loss": 0.3906,
"step": 8600
},
{
"epoch": 1.471245047901741,
"grad_norm": 38.148475646972656,
"learning_rate": 1.3631959126145175e-06,
"loss": 0.4369,
"step": 8610
},
{
"epoch": 1.472953695646127,
"grad_norm": 25.316579818725586,
"learning_rate": 1.3587914023960535e-06,
"loss": 0.3884,
"step": 8620
},
{
"epoch": 1.4746623433905135,
"grad_norm": 40.01092529296875,
"learning_rate": 1.35438689217759e-06,
"loss": 0.4031,
"step": 8630
},
{
"epoch": 1.4763709911348997,
"grad_norm": 23.749156951904297,
"learning_rate": 1.3499823819591262e-06,
"loss": 0.4174,
"step": 8640
},
{
"epoch": 1.478079638879286,
"grad_norm": 25.226078033447266,
"learning_rate": 1.3455778717406625e-06,
"loss": 0.3646,
"step": 8650
},
{
"epoch": 1.4797882866236725,
"grad_norm": 32.117034912109375,
"learning_rate": 1.341173361522199e-06,
"loss": 0.409,
"step": 8660
},
{
"epoch": 1.4814969343680586,
"grad_norm": 27.94634437561035,
"learning_rate": 1.3367688513037353e-06,
"loss": 0.4158,
"step": 8670
},
{
"epoch": 1.4832055821124448,
"grad_norm": 27.515697479248047,
"learning_rate": 1.3323643410852713e-06,
"loss": 0.3976,
"step": 8680
},
{
"epoch": 1.4849142298568312,
"grad_norm": 41.739105224609375,
"learning_rate": 1.3279598308668076e-06,
"loss": 0.337,
"step": 8690
},
{
"epoch": 1.4866228776012176,
"grad_norm": 37.13324737548828,
"learning_rate": 1.323555320648344e-06,
"loss": 0.381,
"step": 8700
},
{
"epoch": 1.4883315253456038,
"grad_norm": 23.801599502563477,
"learning_rate": 1.3191508104298803e-06,
"loss": 0.4244,
"step": 8710
},
{
"epoch": 1.4900401730899901,
"grad_norm": 28.293941497802734,
"learning_rate": 1.3147463002114166e-06,
"loss": 0.3995,
"step": 8720
},
{
"epoch": 1.4917488208343763,
"grad_norm": 23.51873779296875,
"learning_rate": 1.310341789992953e-06,
"loss": 0.3845,
"step": 8730
},
{
"epoch": 1.4934574685787627,
"grad_norm": 25.12767219543457,
"learning_rate": 1.3059372797744893e-06,
"loss": 0.3522,
"step": 8740
},
{
"epoch": 1.495166116323149,
"grad_norm": 21.655824661254883,
"learning_rate": 1.3015327695560254e-06,
"loss": 0.3776,
"step": 8750
},
{
"epoch": 1.4968747640675353,
"grad_norm": 32.18788146972656,
"learning_rate": 1.2971282593375616e-06,
"loss": 0.401,
"step": 8760
},
{
"epoch": 1.4985834118119217,
"grad_norm": 45.1816520690918,
"learning_rate": 1.2927237491190981e-06,
"loss": 0.4374,
"step": 8770
},
{
"epoch": 1.5002920595563078,
"grad_norm": 33.538047790527344,
"learning_rate": 1.2883192389006344e-06,
"loss": 0.4288,
"step": 8780
},
{
"epoch": 1.5020007073006942,
"grad_norm": 31.226816177368164,
"learning_rate": 1.2839147286821706e-06,
"loss": 0.3961,
"step": 8790
},
{
"epoch": 1.5037093550450806,
"grad_norm": 24.751720428466797,
"learning_rate": 1.2795102184637071e-06,
"loss": 0.351,
"step": 8800
},
{
"epoch": 1.5054180027894668,
"grad_norm": 35.17796325683594,
"learning_rate": 1.2751057082452432e-06,
"loss": 0.4673,
"step": 8810
},
{
"epoch": 1.507126650533853,
"grad_norm": 26.320959091186523,
"learning_rate": 1.2707011980267794e-06,
"loss": 0.4025,
"step": 8820
},
{
"epoch": 1.5088352982782394,
"grad_norm": 25.2487735748291,
"learning_rate": 1.2662966878083157e-06,
"loss": 0.3631,
"step": 8830
},
{
"epoch": 1.5105439460226258,
"grad_norm": 26.821157455444336,
"learning_rate": 1.2618921775898522e-06,
"loss": 0.3253,
"step": 8840
},
{
"epoch": 1.512252593767012,
"grad_norm": 20.908111572265625,
"learning_rate": 1.2574876673713885e-06,
"loss": 0.3598,
"step": 8850
},
{
"epoch": 1.513961241511398,
"grad_norm": 39.165706634521484,
"learning_rate": 1.2530831571529247e-06,
"loss": 0.4052,
"step": 8860
},
{
"epoch": 1.5156698892557845,
"grad_norm": 32.390995025634766,
"learning_rate": 1.248678646934461e-06,
"loss": 0.4037,
"step": 8870
},
{
"epoch": 1.517378537000171,
"grad_norm": 28.246858596801758,
"learning_rate": 1.2442741367159972e-06,
"loss": 0.3995,
"step": 8880
},
{
"epoch": 1.5190871847445573,
"grad_norm": 31.864625930786133,
"learning_rate": 1.2398696264975335e-06,
"loss": 0.3803,
"step": 8890
},
{
"epoch": 1.5207958324889435,
"grad_norm": 23.626855850219727,
"learning_rate": 1.2354651162790698e-06,
"loss": 0.3864,
"step": 8900
},
{
"epoch": 1.5225044802333296,
"grad_norm": 24.359804153442383,
"learning_rate": 1.2310606060606063e-06,
"loss": 0.3669,
"step": 8910
},
{
"epoch": 1.524213127977716,
"grad_norm": 27.360803604125977,
"learning_rate": 1.2266560958421425e-06,
"loss": 0.3916,
"step": 8920
},
{
"epoch": 1.5259217757221024,
"grad_norm": 27.511882781982422,
"learning_rate": 1.2222515856236788e-06,
"loss": 0.3745,
"step": 8930
},
{
"epoch": 1.5276304234664888,
"grad_norm": 26.44959831237793,
"learning_rate": 1.217847075405215e-06,
"loss": 0.3918,
"step": 8940
},
{
"epoch": 1.529339071210875,
"grad_norm": 29.03026008605957,
"learning_rate": 1.2134425651867513e-06,
"loss": 0.3829,
"step": 8950
},
{
"epoch": 1.5310477189552611,
"grad_norm": 31.914691925048828,
"learning_rate": 1.2090380549682876e-06,
"loss": 0.3987,
"step": 8960
},
{
"epoch": 1.5327563666996475,
"grad_norm": 39.63639831542969,
"learning_rate": 1.2046335447498238e-06,
"loss": 0.4399,
"step": 8970
},
{
"epoch": 1.534465014444034,
"grad_norm": 25.887651443481445,
"learning_rate": 1.2002290345313603e-06,
"loss": 0.4067,
"step": 8980
},
{
"epoch": 1.53617366218842,
"grad_norm": 30.17310333251953,
"learning_rate": 1.1958245243128964e-06,
"loss": 0.397,
"step": 8990
},
{
"epoch": 1.5378823099328063,
"grad_norm": 28.75864601135254,
"learning_rate": 1.1914200140944329e-06,
"loss": 0.4094,
"step": 9000
},
{
"epoch": 1.5395909576771927,
"grad_norm": 40.84502029418945,
"learning_rate": 1.1870155038759691e-06,
"loss": 0.35,
"step": 9010
},
{
"epoch": 1.541299605421579,
"grad_norm": 27.4794864654541,
"learning_rate": 1.1826109936575054e-06,
"loss": 0.4164,
"step": 9020
},
{
"epoch": 1.5430082531659655,
"grad_norm": 35.87556076049805,
"learning_rate": 1.1782064834390416e-06,
"loss": 0.3561,
"step": 9030
},
{
"epoch": 1.5447169009103516,
"grad_norm": 32.51176071166992,
"learning_rate": 1.173801973220578e-06,
"loss": 0.3813,
"step": 9040
},
{
"epoch": 1.5464255486547378,
"grad_norm": 34.02533721923828,
"learning_rate": 1.1693974630021144e-06,
"loss": 0.4186,
"step": 9050
},
{
"epoch": 1.5481341963991242,
"grad_norm": 25.257232666015625,
"learning_rate": 1.1649929527836504e-06,
"loss": 0.3428,
"step": 9060
},
{
"epoch": 1.5498428441435106,
"grad_norm": 27.12441635131836,
"learning_rate": 1.160588442565187e-06,
"loss": 0.3731,
"step": 9070
},
{
"epoch": 1.5515514918878968,
"grad_norm": 32.43393325805664,
"learning_rate": 1.1561839323467232e-06,
"loss": 0.3484,
"step": 9080
},
{
"epoch": 1.5532601396322832,
"grad_norm": 35.18085479736328,
"learning_rate": 1.1517794221282595e-06,
"loss": 0.3964,
"step": 9090
},
{
"epoch": 1.5549687873766693,
"grad_norm": 30.050132751464844,
"learning_rate": 1.1473749119097957e-06,
"loss": 0.3969,
"step": 9100
},
{
"epoch": 1.5566774351210557,
"grad_norm": 34.45301818847656,
"learning_rate": 1.142970401691332e-06,
"loss": 0.3945,
"step": 9110
},
{
"epoch": 1.5583860828654421,
"grad_norm": 31.908273696899414,
"learning_rate": 1.1385658914728682e-06,
"loss": 0.3823,
"step": 9120
},
{
"epoch": 1.5600947306098283,
"grad_norm": 26.37557601928711,
"learning_rate": 1.1341613812544045e-06,
"loss": 0.3638,
"step": 9130
},
{
"epoch": 1.5618033783542145,
"grad_norm": 25.550487518310547,
"learning_rate": 1.129756871035941e-06,
"loss": 0.3556,
"step": 9140
},
{
"epoch": 1.5635120260986008,
"grad_norm": 28.921995162963867,
"learning_rate": 1.1253523608174773e-06,
"loss": 0.4134,
"step": 9150
},
{
"epoch": 1.5652206738429872,
"grad_norm": 26.404720306396484,
"learning_rate": 1.1209478505990135e-06,
"loss": 0.3664,
"step": 9160
},
{
"epoch": 1.5669293215873736,
"grad_norm": 29.88231086730957,
"learning_rate": 1.1165433403805498e-06,
"loss": 0.3848,
"step": 9170
},
{
"epoch": 1.5686379693317598,
"grad_norm": 38.20869827270508,
"learning_rate": 1.112138830162086e-06,
"loss": 0.3823,
"step": 9180
},
{
"epoch": 1.570346617076146,
"grad_norm": 42.82072448730469,
"learning_rate": 1.1077343199436223e-06,
"loss": 0.372,
"step": 9190
},
{
"epoch": 1.5720552648205324,
"grad_norm": 27.147830963134766,
"learning_rate": 1.1033298097251586e-06,
"loss": 0.4172,
"step": 9200
},
{
"epoch": 1.5737639125649188,
"grad_norm": 32.74360656738281,
"learning_rate": 1.098925299506695e-06,
"loss": 0.3772,
"step": 9210
},
{
"epoch": 1.575472560309305,
"grad_norm": 23.909259796142578,
"learning_rate": 1.0945207892882311e-06,
"loss": 0.3215,
"step": 9220
},
{
"epoch": 1.5771812080536913,
"grad_norm": 32.20122146606445,
"learning_rate": 1.0901162790697676e-06,
"loss": 0.3864,
"step": 9230
},
{
"epoch": 1.5788898557980775,
"grad_norm": 29.837228775024414,
"learning_rate": 1.0857117688513039e-06,
"loss": 0.4026,
"step": 9240
},
{
"epoch": 1.580598503542464,
"grad_norm": 34.161033630371094,
"learning_rate": 1.0813072586328401e-06,
"loss": 0.4217,
"step": 9250
},
{
"epoch": 1.5823071512868503,
"grad_norm": 39.935638427734375,
"learning_rate": 1.0769027484143764e-06,
"loss": 0.3818,
"step": 9260
},
{
"epoch": 1.5840157990312365,
"grad_norm": 29.2546443939209,
"learning_rate": 1.0724982381959126e-06,
"loss": 0.406,
"step": 9270
},
{
"epoch": 1.5857244467756226,
"grad_norm": 33.643367767333984,
"learning_rate": 1.0680937279774491e-06,
"loss": 0.3627,
"step": 9280
},
{
"epoch": 1.587433094520009,
"grad_norm": 48.66536331176758,
"learning_rate": 1.0636892177589852e-06,
"loss": 0.3924,
"step": 9290
},
{
"epoch": 1.5891417422643954,
"grad_norm": 29.057153701782227,
"learning_rate": 1.0592847075405217e-06,
"loss": 0.3797,
"step": 9300
},
{
"epoch": 1.5908503900087818,
"grad_norm": 30.0162296295166,
"learning_rate": 1.054880197322058e-06,
"loss": 0.4337,
"step": 9310
},
{
"epoch": 1.592559037753168,
"grad_norm": 30.404836654663086,
"learning_rate": 1.0504756871035942e-06,
"loss": 0.3784,
"step": 9320
},
{
"epoch": 1.5942676854975542,
"grad_norm": 41.39947509765625,
"learning_rate": 1.0460711768851305e-06,
"loss": 0.3421,
"step": 9330
},
{
"epoch": 1.5959763332419405,
"grad_norm": 25.326269149780273,
"learning_rate": 1.0416666666666667e-06,
"loss": 0.3901,
"step": 9340
},
{
"epoch": 1.597684980986327,
"grad_norm": 29.41655731201172,
"learning_rate": 1.037262156448203e-06,
"loss": 0.3393,
"step": 9350
},
{
"epoch": 1.5993936287307131,
"grad_norm": 30.155683517456055,
"learning_rate": 1.0328576462297392e-06,
"loss": 0.3745,
"step": 9360
},
{
"epoch": 1.6011022764750993,
"grad_norm": 27.556821823120117,
"learning_rate": 1.0284531360112757e-06,
"loss": 0.4145,
"step": 9370
},
{
"epoch": 1.6028109242194857,
"grad_norm": 39.412540435791016,
"learning_rate": 1.0240486257928118e-06,
"loss": 0.3815,
"step": 9380
},
{
"epoch": 1.604519571963872,
"grad_norm": 30.376188278198242,
"learning_rate": 1.0196441155743483e-06,
"loss": 0.3812,
"step": 9390
},
{
"epoch": 1.6062282197082585,
"grad_norm": 35.539546966552734,
"learning_rate": 1.0152396053558845e-06,
"loss": 0.3949,
"step": 9400
},
{
"epoch": 1.6079368674526446,
"grad_norm": 27.052183151245117,
"learning_rate": 1.0108350951374208e-06,
"loss": 0.4059,
"step": 9410
},
{
"epoch": 1.6096455151970308,
"grad_norm": 22.53864860534668,
"learning_rate": 1.006430584918957e-06,
"loss": 0.3381,
"step": 9420
},
{
"epoch": 1.6113541629414172,
"grad_norm": 33.662052154541016,
"learning_rate": 1.0020260747004933e-06,
"loss": 0.3852,
"step": 9430
},
{
"epoch": 1.6130628106858036,
"grad_norm": 35.129295349121094,
"learning_rate": 9.976215644820298e-07,
"loss": 0.3737,
"step": 9440
},
{
"epoch": 1.6147714584301898,
"grad_norm": 14.55792236328125,
"learning_rate": 9.932170542635658e-07,
"loss": 0.3682,
"step": 9450
},
{
"epoch": 1.6164801061745762,
"grad_norm": 34.31297302246094,
"learning_rate": 9.888125440451023e-07,
"loss": 0.394,
"step": 9460
},
{
"epoch": 1.6181887539189623,
"grad_norm": 28.12514305114746,
"learning_rate": 9.844080338266386e-07,
"loss": 0.4253,
"step": 9470
},
{
"epoch": 1.6198974016633487,
"grad_norm": 31.71592140197754,
"learning_rate": 9.800035236081749e-07,
"loss": 0.3742,
"step": 9480
},
{
"epoch": 1.6216060494077351,
"grad_norm": 33.897281646728516,
"learning_rate": 9.755990133897111e-07,
"loss": 0.3855,
"step": 9490
},
{
"epoch": 1.6233146971521213,
"grad_norm": 26.927099227905273,
"learning_rate": 9.711945031712474e-07,
"loss": 0.4042,
"step": 9500
},
{
"epoch": 1.6250233448965075,
"grad_norm": 31.36831283569336,
"learning_rate": 9.667899929527836e-07,
"loss": 0.4033,
"step": 9510
},
{
"epoch": 1.6267319926408939,
"grad_norm": 32.52813720703125,
"learning_rate": 9.6238548273432e-07,
"loss": 0.4001,
"step": 9520
},
{
"epoch": 1.6284406403852802,
"grad_norm": 29.446916580200195,
"learning_rate": 9.579809725158564e-07,
"loss": 0.3865,
"step": 9530
},
{
"epoch": 1.6301492881296666,
"grad_norm": 22.962326049804688,
"learning_rate": 9.535764622973927e-07,
"loss": 0.3399,
"step": 9540
},
{
"epoch": 1.6318579358740528,
"grad_norm": 22.97249984741211,
"learning_rate": 9.491719520789289e-07,
"loss": 0.3967,
"step": 9550
},
{
"epoch": 1.633566583618439,
"grad_norm": 40.018470764160156,
"learning_rate": 9.447674418604652e-07,
"loss": 0.3935,
"step": 9560
},
{
"epoch": 1.6352752313628254,
"grad_norm": 22.444059371948242,
"learning_rate": 9.403629316420016e-07,
"loss": 0.3224,
"step": 9570
},
{
"epoch": 1.6369838791072118,
"grad_norm": 34.330078125,
"learning_rate": 9.359584214235377e-07,
"loss": 0.3486,
"step": 9580
},
{
"epoch": 1.638692526851598,
"grad_norm": 35.540557861328125,
"learning_rate": 9.315539112050741e-07,
"loss": 0.3563,
"step": 9590
},
{
"epoch": 1.6404011745959843,
"grad_norm": 24.032527923583984,
"learning_rate": 9.271494009866105e-07,
"loss": 0.3191,
"step": 9600
},
{
"epoch": 1.6421098223403705,
"grad_norm": 38.39560317993164,
"learning_rate": 9.227448907681466e-07,
"loss": 0.3694,
"step": 9610
},
{
"epoch": 1.643818470084757,
"grad_norm": 40.29669952392578,
"learning_rate": 9.18340380549683e-07,
"loss": 0.3968,
"step": 9620
},
{
"epoch": 1.6455271178291433,
"grad_norm": 28.967849731445312,
"learning_rate": 9.139358703312193e-07,
"loss": 0.3501,
"step": 9630
},
{
"epoch": 1.6472357655735295,
"grad_norm": 44.81010437011719,
"learning_rate": 9.095313601127555e-07,
"loss": 0.415,
"step": 9640
},
{
"epoch": 1.6489444133179156,
"grad_norm": 25.93589210510254,
"learning_rate": 9.051268498942918e-07,
"loss": 0.3933,
"step": 9650
},
{
"epoch": 1.650653061062302,
"grad_norm": 31.824234008789062,
"learning_rate": 9.007223396758282e-07,
"loss": 0.3739,
"step": 9660
},
{
"epoch": 1.6523617088066884,
"grad_norm": 34.3546142578125,
"learning_rate": 8.963178294573645e-07,
"loss": 0.3996,
"step": 9670
},
{
"epoch": 1.6540703565510748,
"grad_norm": 23.639925003051758,
"learning_rate": 8.919133192389007e-07,
"loss": 0.3853,
"step": 9680
},
{
"epoch": 1.655779004295461,
"grad_norm": 30.642179489135742,
"learning_rate": 8.875088090204371e-07,
"loss": 0.3762,
"step": 9690
},
{
"epoch": 1.6574876520398472,
"grad_norm": 30.923620223999023,
"learning_rate": 8.831042988019733e-07,
"loss": 0.3923,
"step": 9700
},
{
"epoch": 1.6591962997842336,
"grad_norm": 27.91309356689453,
"learning_rate": 8.786997885835096e-07,
"loss": 0.3604,
"step": 9710
},
{
"epoch": 1.66090494752862,
"grad_norm": 23.54095458984375,
"learning_rate": 8.742952783650459e-07,
"loss": 0.3512,
"step": 9720
},
{
"epoch": 1.6626135952730061,
"grad_norm": 31.084632873535156,
"learning_rate": 8.698907681465822e-07,
"loss": 0.3475,
"step": 9730
},
{
"epoch": 1.6643222430173923,
"grad_norm": 34.60007095336914,
"learning_rate": 8.654862579281184e-07,
"loss": 0.3999,
"step": 9740
},
{
"epoch": 1.6660308907617787,
"grad_norm": 32.12785339355469,
"learning_rate": 8.610817477096548e-07,
"loss": 0.4169,
"step": 9750
},
{
"epoch": 1.667739538506165,
"grad_norm": 26.730180740356445,
"learning_rate": 8.566772374911911e-07,
"loss": 0.3191,
"step": 9760
},
{
"epoch": 1.6694481862505515,
"grad_norm": 29.191030502319336,
"learning_rate": 8.522727272727273e-07,
"loss": 0.3468,
"step": 9770
},
{
"epoch": 1.6711568339949376,
"grad_norm": 30.42900848388672,
"learning_rate": 8.478682170542637e-07,
"loss": 0.3948,
"step": 9780
},
{
"epoch": 1.6728654817393238,
"grad_norm": 36.10079574584961,
"learning_rate": 8.434637068357999e-07,
"loss": 0.3741,
"step": 9790
},
{
"epoch": 1.6745741294837102,
"grad_norm": 24.84588050842285,
"learning_rate": 8.390591966173363e-07,
"loss": 0.3342,
"step": 9800
},
{
"epoch": 1.6762827772280966,
"grad_norm": 28.362817764282227,
"learning_rate": 8.346546863988725e-07,
"loss": 0.3723,
"step": 9810
},
{
"epoch": 1.6779914249724828,
"grad_norm": 31.064945220947266,
"learning_rate": 8.302501761804088e-07,
"loss": 0.3648,
"step": 9820
},
{
"epoch": 1.6797000727168692,
"grad_norm": 43.73317337036133,
"learning_rate": 8.258456659619452e-07,
"loss": 0.3849,
"step": 9830
},
{
"epoch": 1.6814087204612553,
"grad_norm": 36.45133590698242,
"learning_rate": 8.214411557434814e-07,
"loss": 0.4135,
"step": 9840
},
{
"epoch": 1.6831173682056417,
"grad_norm": 24.040943145751953,
"learning_rate": 8.170366455250177e-07,
"loss": 0.3828,
"step": 9850
},
{
"epoch": 1.6848260159500281,
"grad_norm": 34.76506805419922,
"learning_rate": 8.12632135306554e-07,
"loss": 0.3583,
"step": 9860
},
{
"epoch": 1.6865346636944143,
"grad_norm": 34.13239669799805,
"learning_rate": 8.082276250880903e-07,
"loss": 0.3613,
"step": 9870
},
{
"epoch": 1.6882433114388005,
"grad_norm": 25.49158477783203,
"learning_rate": 8.038231148696265e-07,
"loss": 0.4262,
"step": 9880
},
{
"epoch": 1.6899519591831869,
"grad_norm": 35.75178909301758,
"learning_rate": 7.994186046511629e-07,
"loss": 0.4004,
"step": 9890
},
{
"epoch": 1.6916606069275733,
"grad_norm": 32.253150939941406,
"learning_rate": 7.95014094432699e-07,
"loss": 0.3644,
"step": 9900
},
{
"epoch": 1.6933692546719596,
"grad_norm": 37.89906692504883,
"learning_rate": 7.906095842142354e-07,
"loss": 0.3617,
"step": 9910
},
{
"epoch": 1.6950779024163458,
"grad_norm": 21.644926071166992,
"learning_rate": 7.862050739957718e-07,
"loss": 0.3764,
"step": 9920
},
{
"epoch": 1.696786550160732,
"grad_norm": 34.483253479003906,
"learning_rate": 7.818005637773081e-07,
"loss": 0.4131,
"step": 9930
},
{
"epoch": 1.6984951979051184,
"grad_norm": 43.877708435058594,
"learning_rate": 7.773960535588443e-07,
"loss": 0.3782,
"step": 9940
},
{
"epoch": 1.7002038456495048,
"grad_norm": 56.46201705932617,
"learning_rate": 7.729915433403806e-07,
"loss": 0.3922,
"step": 9950
},
{
"epoch": 1.701912493393891,
"grad_norm": 30.294981002807617,
"learning_rate": 7.68587033121917e-07,
"loss": 0.3839,
"step": 9960
},
{
"epoch": 1.7036211411382773,
"grad_norm": 36.37797927856445,
"learning_rate": 7.641825229034531e-07,
"loss": 0.37,
"step": 9970
},
{
"epoch": 1.7053297888826635,
"grad_norm": 32.37224197387695,
"learning_rate": 7.597780126849895e-07,
"loss": 0.3552,
"step": 9980
},
{
"epoch": 1.70703843662705,
"grad_norm": 37.46088790893555,
"learning_rate": 7.553735024665259e-07,
"loss": 0.3815,
"step": 9990
},
{
"epoch": 1.7087470843714363,
"grad_norm": 32.850372314453125,
"learning_rate": 7.50968992248062e-07,
"loss": 0.3688,
"step": 10000
},
{
"epoch": 1.7104557321158225,
"grad_norm": 47.176239013671875,
"learning_rate": 7.465644820295984e-07,
"loss": 0.3789,
"step": 10010
},
{
"epoch": 1.7121643798602086,
"grad_norm": 24.945432662963867,
"learning_rate": 7.421599718111347e-07,
"loss": 0.382,
"step": 10020
},
{
"epoch": 1.713873027604595,
"grad_norm": 21.04591941833496,
"learning_rate": 7.377554615926709e-07,
"loss": 0.3695,
"step": 10030
},
{
"epoch": 1.7155816753489814,
"grad_norm": 33.52159881591797,
"learning_rate": 7.333509513742072e-07,
"loss": 0.3597,
"step": 10040
},
{
"epoch": 1.7172903230933678,
"grad_norm": 30.122079849243164,
"learning_rate": 7.289464411557436e-07,
"loss": 0.3875,
"step": 10050
},
{
"epoch": 1.718998970837754,
"grad_norm": 24.38621711730957,
"learning_rate": 7.245419309372799e-07,
"loss": 0.3351,
"step": 10060
},
{
"epoch": 1.7207076185821402,
"grad_norm": 47.98723220825195,
"learning_rate": 7.201374207188161e-07,
"loss": 0.3836,
"step": 10070
},
{
"epoch": 1.7224162663265266,
"grad_norm": 37.2187614440918,
"learning_rate": 7.157329105003525e-07,
"loss": 0.3614,
"step": 10080
},
{
"epoch": 1.724124914070913,
"grad_norm": 30.610862731933594,
"learning_rate": 7.113284002818887e-07,
"loss": 0.3795,
"step": 10090
},
{
"epoch": 1.7258335618152991,
"grad_norm": 22.508331298828125,
"learning_rate": 7.06923890063425e-07,
"loss": 0.3536,
"step": 10100
},
{
"epoch": 1.7275422095596853,
"grad_norm": 37.26981735229492,
"learning_rate": 7.025193798449613e-07,
"loss": 0.3832,
"step": 10110
},
{
"epoch": 1.7292508573040717,
"grad_norm": 19.104637145996094,
"learning_rate": 6.981148696264976e-07,
"loss": 0.3522,
"step": 10120
},
{
"epoch": 1.730959505048458,
"grad_norm": 23.52967071533203,
"learning_rate": 6.937103594080338e-07,
"loss": 0.391,
"step": 10130
},
{
"epoch": 1.7326681527928445,
"grad_norm": 27.223722457885742,
"learning_rate": 6.893058491895702e-07,
"loss": 0.3904,
"step": 10140
},
{
"epoch": 1.7343768005372306,
"grad_norm": 30.344676971435547,
"learning_rate": 6.849013389711065e-07,
"loss": 0.391,
"step": 10150
},
{
"epoch": 1.7360854482816168,
"grad_norm": 22.83699607849121,
"learning_rate": 6.804968287526427e-07,
"loss": 0.3653,
"step": 10160
},
{
"epoch": 1.7377940960260032,
"grad_norm": 32.153663635253906,
"learning_rate": 6.760923185341791e-07,
"loss": 0.3531,
"step": 10170
},
{
"epoch": 1.7395027437703896,
"grad_norm": 36.864925384521484,
"learning_rate": 6.716878083157153e-07,
"loss": 0.3391,
"step": 10180
},
{
"epoch": 1.7412113915147758,
"grad_norm": 41.715576171875,
"learning_rate": 6.672832980972517e-07,
"loss": 0.3993,
"step": 10190
},
{
"epoch": 1.7429200392591622,
"grad_norm": 35.69621276855469,
"learning_rate": 6.628787878787879e-07,
"loss": 0.3121,
"step": 10200
},
{
"epoch": 1.7446286870035483,
"grad_norm": 33.0884895324707,
"learning_rate": 6.584742776603242e-07,
"loss": 0.3827,
"step": 10210
},
{
"epoch": 1.7463373347479347,
"grad_norm": 26.627431869506836,
"learning_rate": 6.540697674418606e-07,
"loss": 0.3759,
"step": 10220
},
{
"epoch": 1.7480459824923211,
"grad_norm": 32.8358039855957,
"learning_rate": 6.496652572233968e-07,
"loss": 0.3983,
"step": 10230
},
{
"epoch": 1.7497546302367073,
"grad_norm": 26.243980407714844,
"learning_rate": 6.452607470049331e-07,
"loss": 0.3857,
"step": 10240
},
{
"epoch": 1.7514632779810935,
"grad_norm": 26.84737205505371,
"learning_rate": 6.408562367864694e-07,
"loss": 0.3771,
"step": 10250
},
{
"epoch": 1.7531719257254799,
"grad_norm": 31.410524368286133,
"learning_rate": 6.364517265680057e-07,
"loss": 0.3794,
"step": 10260
},
{
"epoch": 1.7548805734698663,
"grad_norm": 38.454044342041016,
"learning_rate": 6.320472163495419e-07,
"loss": 0.3595,
"step": 10270
},
{
"epoch": 1.7565892212142526,
"grad_norm": 18.861108779907227,
"learning_rate": 6.276427061310783e-07,
"loss": 0.3457,
"step": 10280
},
{
"epoch": 1.7582978689586388,
"grad_norm": 28.400564193725586,
"learning_rate": 6.232381959126146e-07,
"loss": 0.3511,
"step": 10290
},
{
"epoch": 1.760006516703025,
"grad_norm": 47.41775894165039,
"learning_rate": 6.188336856941508e-07,
"loss": 0.3544,
"step": 10300
},
{
"epoch": 1.7617151644474114,
"grad_norm": 30.009010314941406,
"learning_rate": 6.144291754756872e-07,
"loss": 0.3273,
"step": 10310
},
{
"epoch": 1.7634238121917978,
"grad_norm": 25.67041778564453,
"learning_rate": 6.100246652572235e-07,
"loss": 0.3619,
"step": 10320
},
{
"epoch": 1.765132459936184,
"grad_norm": 28.06591796875,
"learning_rate": 6.056201550387597e-07,
"loss": 0.3577,
"step": 10330
},
{
"epoch": 1.7668411076805703,
"grad_norm": 25.041889190673828,
"learning_rate": 6.01215644820296e-07,
"loss": 0.3688,
"step": 10340
},
{
"epoch": 1.7685497554249565,
"grad_norm": 32.429443359375,
"learning_rate": 5.968111346018323e-07,
"loss": 0.4009,
"step": 10350
},
{
"epoch": 1.770258403169343,
"grad_norm": 23.519460678100586,
"learning_rate": 5.924066243833686e-07,
"loss": 0.366,
"step": 10360
},
{
"epoch": 1.7719670509137293,
"grad_norm": 36.32727813720703,
"learning_rate": 5.880021141649049e-07,
"loss": 0.4004,
"step": 10370
},
{
"epoch": 1.7736756986581155,
"grad_norm": 30.008052825927734,
"learning_rate": 5.835976039464412e-07,
"loss": 0.3707,
"step": 10380
},
{
"epoch": 1.7753843464025016,
"grad_norm": 34.22142791748047,
"learning_rate": 5.791930937279775e-07,
"loss": 0.3848,
"step": 10390
},
{
"epoch": 1.777092994146888,
"grad_norm": 21.506912231445312,
"learning_rate": 5.747885835095138e-07,
"loss": 0.3677,
"step": 10400
},
{
"epoch": 1.7788016418912744,
"grad_norm": 33.4599609375,
"learning_rate": 5.703840732910502e-07,
"loss": 0.394,
"step": 10410
},
{
"epoch": 1.7805102896356608,
"grad_norm": 36.893394470214844,
"learning_rate": 5.659795630725864e-07,
"loss": 0.3871,
"step": 10420
},
{
"epoch": 1.782218937380047,
"grad_norm": 46.39961624145508,
"learning_rate": 5.615750528541227e-07,
"loss": 0.4025,
"step": 10430
},
{
"epoch": 1.7839275851244332,
"grad_norm": 23.366689682006836,
"learning_rate": 5.57170542635659e-07,
"loss": 0.3694,
"step": 10440
},
{
"epoch": 1.7856362328688196,
"grad_norm": 61.47678756713867,
"learning_rate": 5.527660324171952e-07,
"loss": 0.3759,
"step": 10450
},
{
"epoch": 1.787344880613206,
"grad_norm": 27.12241554260254,
"learning_rate": 5.483615221987316e-07,
"loss": 0.3552,
"step": 10460
},
{
"epoch": 1.7890535283575921,
"grad_norm": 29.733963012695312,
"learning_rate": 5.439570119802679e-07,
"loss": 0.3966,
"step": 10470
},
{
"epoch": 1.7907621761019783,
"grad_norm": 29.353418350219727,
"learning_rate": 5.395525017618041e-07,
"loss": 0.3642,
"step": 10480
},
{
"epoch": 1.7924708238463647,
"grad_norm": 26.14151954650879,
"learning_rate": 5.351479915433405e-07,
"loss": 0.3776,
"step": 10490
},
{
"epoch": 1.794179471590751,
"grad_norm": 33.61710739135742,
"learning_rate": 5.307434813248768e-07,
"loss": 0.3484,
"step": 10500
},
{
"epoch": 1.7958881193351375,
"grad_norm": 25.79818344116211,
"learning_rate": 5.26338971106413e-07,
"loss": 0.3498,
"step": 10510
},
{
"epoch": 1.7975967670795236,
"grad_norm": 34.903533935546875,
"learning_rate": 5.219344608879493e-07,
"loss": 0.4224,
"step": 10520
},
{
"epoch": 1.7993054148239098,
"grad_norm": 42.71726608276367,
"learning_rate": 5.175299506694856e-07,
"loss": 0.3524,
"step": 10530
},
{
"epoch": 1.8010140625682962,
"grad_norm": 35.538875579833984,
"learning_rate": 5.131254404510219e-07,
"loss": 0.3471,
"step": 10540
},
{
"epoch": 1.8027227103126826,
"grad_norm": 34.071388244628906,
"learning_rate": 5.087209302325582e-07,
"loss": 0.3994,
"step": 10550
},
{
"epoch": 1.8044313580570688,
"grad_norm": 30.25017738342285,
"learning_rate": 5.043164200140945e-07,
"loss": 0.4127,
"step": 10560
},
{
"epoch": 1.8061400058014552,
"grad_norm": 27.28138542175293,
"learning_rate": 4.999119097956308e-07,
"loss": 0.3286,
"step": 10570
},
{
"epoch": 1.8078486535458413,
"grad_norm": 38.14504623413086,
"learning_rate": 4.955073995771671e-07,
"loss": 0.3763,
"step": 10580
},
{
"epoch": 1.8095573012902277,
"grad_norm": 30.078149795532227,
"learning_rate": 4.911028893587034e-07,
"loss": 0.352,
"step": 10590
},
{
"epoch": 1.8112659490346141,
"grad_norm": 24.926767349243164,
"learning_rate": 4.866983791402396e-07,
"loss": 0.3421,
"step": 10600
},
{
"epoch": 1.8129745967790003,
"grad_norm": 46.622650146484375,
"learning_rate": 4.822938689217759e-07,
"loss": 0.3835,
"step": 10610
},
{
"epoch": 1.8146832445233865,
"grad_norm": 21.296682357788086,
"learning_rate": 4.778893587033123e-07,
"loss": 0.4098,
"step": 10620
},
{
"epoch": 1.8163918922677729,
"grad_norm": 27.754459381103516,
"learning_rate": 4.7348484848484853e-07,
"loss": 0.3576,
"step": 10630
},
{
"epoch": 1.8181005400121593,
"grad_norm": 26.44339942932129,
"learning_rate": 4.690803382663848e-07,
"loss": 0.3411,
"step": 10640
},
{
"epoch": 1.8198091877565457,
"grad_norm": 24.727949142456055,
"learning_rate": 4.646758280479211e-07,
"loss": 0.333,
"step": 10650
},
{
"epoch": 1.8215178355009318,
"grad_norm": 36.50139617919922,
"learning_rate": 4.602713178294574e-07,
"loss": 0.3346,
"step": 10660
},
{
"epoch": 1.823226483245318,
"grad_norm": 32.99855041503906,
"learning_rate": 4.5586680761099375e-07,
"loss": 0.4303,
"step": 10670
},
{
"epoch": 1.8249351309897044,
"grad_norm": 23.56210708618164,
"learning_rate": 4.5146229739253e-07,
"loss": 0.3567,
"step": 10680
},
{
"epoch": 1.8266437787340908,
"grad_norm": 32.45067596435547,
"learning_rate": 4.470577871740663e-07,
"loss": 0.3615,
"step": 10690
},
{
"epoch": 1.828352426478477,
"grad_norm": 26.946245193481445,
"learning_rate": 4.426532769556026e-07,
"loss": 0.3577,
"step": 10700
},
{
"epoch": 1.8300610742228633,
"grad_norm": 25.937786102294922,
"learning_rate": 4.3824876673713886e-07,
"loss": 0.3657,
"step": 10710
},
{
"epoch": 1.8317697219672495,
"grad_norm": 22.442626953125,
"learning_rate": 4.338442565186752e-07,
"loss": 0.3988,
"step": 10720
},
{
"epoch": 1.833478369711636,
"grad_norm": 35.23172378540039,
"learning_rate": 4.2943974630021144e-07,
"loss": 0.3931,
"step": 10730
},
{
"epoch": 1.8351870174560223,
"grad_norm": 36.66183090209961,
"learning_rate": 4.250352360817477e-07,
"loss": 0.4132,
"step": 10740
},
{
"epoch": 1.8368956652004085,
"grad_norm": 38.16518783569336,
"learning_rate": 4.206307258632841e-07,
"loss": 0.3592,
"step": 10750
},
{
"epoch": 1.8386043129447946,
"grad_norm": 29.179283142089844,
"learning_rate": 4.1622621564482034e-07,
"loss": 0.376,
"step": 10760
},
{
"epoch": 1.840312960689181,
"grad_norm": 32.8124885559082,
"learning_rate": 4.118217054263566e-07,
"loss": 0.4341,
"step": 10770
},
{
"epoch": 1.8420216084335674,
"grad_norm": 34.435943603515625,
"learning_rate": 4.0741719520789293e-07,
"loss": 0.3602,
"step": 10780
},
{
"epoch": 1.8437302561779538,
"grad_norm": 23.411712646484375,
"learning_rate": 4.030126849894292e-07,
"loss": 0.326,
"step": 10790
},
{
"epoch": 1.84543890392234,
"grad_norm": 39.64480209350586,
"learning_rate": 3.986081747709655e-07,
"loss": 0.393,
"step": 10800
},
{
"epoch": 1.8471475516667262,
"grad_norm": 30.89308738708496,
"learning_rate": 3.942036645525018e-07,
"loss": 0.3746,
"step": 10810
},
{
"epoch": 1.8488561994111126,
"grad_norm": 27.453231811523438,
"learning_rate": 3.8979915433403804e-07,
"loss": 0.3775,
"step": 10820
},
{
"epoch": 1.850564847155499,
"grad_norm": 39.444679260253906,
"learning_rate": 3.853946441155744e-07,
"loss": 0.3238,
"step": 10830
},
{
"epoch": 1.8522734948998851,
"grad_norm": 25.87100601196289,
"learning_rate": 3.809901338971107e-07,
"loss": 0.3351,
"step": 10840
},
{
"epoch": 1.8539821426442713,
"grad_norm": 38.50906753540039,
"learning_rate": 3.76585623678647e-07,
"loss": 0.4047,
"step": 10850
},
{
"epoch": 1.8556907903886577,
"grad_norm": 28.932676315307617,
"learning_rate": 3.7218111346018326e-07,
"loss": 0.3853,
"step": 10860
},
{
"epoch": 1.857399438133044,
"grad_norm": 38.7553596496582,
"learning_rate": 3.6777660324171953e-07,
"loss": 0.4117,
"step": 10870
},
{
"epoch": 1.8591080858774305,
"grad_norm": 37.8046760559082,
"learning_rate": 3.6337209302325584e-07,
"loss": 0.3444,
"step": 10880
},
{
"epoch": 1.8608167336218167,
"grad_norm": 37.61636734008789,
"learning_rate": 3.589675828047921e-07,
"loss": 0.37,
"step": 10890
},
{
"epoch": 1.8625253813662028,
"grad_norm": 31.169891357421875,
"learning_rate": 3.545630725863284e-07,
"loss": 0.3625,
"step": 10900
},
{
"epoch": 1.8642340291105892,
"grad_norm": 33.97384262084961,
"learning_rate": 3.5015856236786475e-07,
"loss": 0.3595,
"step": 10910
},
{
"epoch": 1.8659426768549756,
"grad_norm": 33.35996627807617,
"learning_rate": 3.45754052149401e-07,
"loss": 0.367,
"step": 10920
},
{
"epoch": 1.867651324599362,
"grad_norm": 31.67682647705078,
"learning_rate": 3.4134954193093733e-07,
"loss": 0.3905,
"step": 10930
},
{
"epoch": 1.8693599723437482,
"grad_norm": 34.16012954711914,
"learning_rate": 3.369450317124736e-07,
"loss": 0.3639,
"step": 10940
},
{
"epoch": 1.8710686200881343,
"grad_norm": 38.885986328125,
"learning_rate": 3.3254052149400986e-07,
"loss": 0.3593,
"step": 10950
},
{
"epoch": 1.8727772678325207,
"grad_norm": 35.09337615966797,
"learning_rate": 3.281360112755462e-07,
"loss": 0.4085,
"step": 10960
},
{
"epoch": 1.8744859155769071,
"grad_norm": 36.90644073486328,
"learning_rate": 3.2373150105708244e-07,
"loss": 0.3615,
"step": 10970
},
{
"epoch": 1.8761945633212933,
"grad_norm": 25.444183349609375,
"learning_rate": 3.193269908386188e-07,
"loss": 0.3644,
"step": 10980
},
{
"epoch": 1.8779032110656795,
"grad_norm": 30.481740951538086,
"learning_rate": 3.149224806201551e-07,
"loss": 0.378,
"step": 10990
},
{
"epoch": 1.8796118588100659,
"grad_norm": 37.78234100341797,
"learning_rate": 3.1051797040169134e-07,
"loss": 0.387,
"step": 11000
},
{
"epoch": 1.8813205065544523,
"grad_norm": 25.613048553466797,
"learning_rate": 3.061134601832276e-07,
"loss": 0.3335,
"step": 11010
},
{
"epoch": 1.8830291542988387,
"grad_norm": 39.77134323120117,
"learning_rate": 3.0170894996476393e-07,
"loss": 0.3843,
"step": 11020
},
{
"epoch": 1.8847378020432248,
"grad_norm": 22.53700065612793,
"learning_rate": 2.9730443974630025e-07,
"loss": 0.3685,
"step": 11030
},
{
"epoch": 1.886446449787611,
"grad_norm": 44.945308685302734,
"learning_rate": 2.928999295278365e-07,
"loss": 0.3302,
"step": 11040
},
{
"epoch": 1.8881550975319974,
"grad_norm": 31.36821174621582,
"learning_rate": 2.8849541930937283e-07,
"loss": 0.3876,
"step": 11050
},
{
"epoch": 1.8898637452763838,
"grad_norm": 38.52021408081055,
"learning_rate": 2.840909090909091e-07,
"loss": 0.3741,
"step": 11060
},
{
"epoch": 1.89157239302077,
"grad_norm": 40.30624008178711,
"learning_rate": 2.796863988724454e-07,
"loss": 0.3921,
"step": 11070
},
{
"epoch": 1.8932810407651564,
"grad_norm": 29.259140014648438,
"learning_rate": 2.7528188865398173e-07,
"loss": 0.3893,
"step": 11080
},
{
"epoch": 1.8949896885095425,
"grad_norm": 25.17171287536621,
"learning_rate": 2.70877378435518e-07,
"loss": 0.3756,
"step": 11090
},
{
"epoch": 1.896698336253929,
"grad_norm": 37.15606689453125,
"learning_rate": 2.6647286821705426e-07,
"loss": 0.4048,
"step": 11100
},
{
"epoch": 1.8984069839983153,
"grad_norm": 24.475324630737305,
"learning_rate": 2.620683579985906e-07,
"loss": 0.3709,
"step": 11110
},
{
"epoch": 1.9001156317427015,
"grad_norm": 28.089601516723633,
"learning_rate": 2.576638477801269e-07,
"loss": 0.3933,
"step": 11120
},
{
"epoch": 1.9018242794870877,
"grad_norm": 24.580224990844727,
"learning_rate": 2.5325933756166316e-07,
"loss": 0.3557,
"step": 11130
},
{
"epoch": 1.903532927231474,
"grad_norm": 31.057662963867188,
"learning_rate": 2.4885482734319943e-07,
"loss": 0.3912,
"step": 11140
},
{
"epoch": 1.9052415749758604,
"grad_norm": 36.91437530517578,
"learning_rate": 2.4445031712473575e-07,
"loss": 0.363,
"step": 11150
},
{
"epoch": 1.9069502227202468,
"grad_norm": 28.377185821533203,
"learning_rate": 2.4004580690627206e-07,
"loss": 0.41,
"step": 11160
},
{
"epoch": 1.908658870464633,
"grad_norm": 33.51364517211914,
"learning_rate": 2.3564129668780836e-07,
"loss": 0.362,
"step": 11170
},
{
"epoch": 1.9103675182090192,
"grad_norm": 23.851999282836914,
"learning_rate": 2.3123678646934465e-07,
"loss": 0.3516,
"step": 11180
},
{
"epoch": 1.9120761659534056,
"grad_norm": 27.512645721435547,
"learning_rate": 2.268322762508809e-07,
"loss": 0.3736,
"step": 11190
},
{
"epoch": 1.913784813697792,
"grad_norm": 33.09054183959961,
"learning_rate": 2.224277660324172e-07,
"loss": 0.3536,
"step": 11200
},
{
"epoch": 1.9154934614421781,
"grad_norm": 39.228851318359375,
"learning_rate": 2.1802325581395352e-07,
"loss": 0.3922,
"step": 11210
},
{
"epoch": 1.9172021091865643,
"grad_norm": 27.592710494995117,
"learning_rate": 2.1361874559548981e-07,
"loss": 0.3146,
"step": 11220
},
{
"epoch": 1.9189107569309507,
"grad_norm": 50.0390739440918,
"learning_rate": 2.0921423537702608e-07,
"loss": 0.3441,
"step": 11230
},
{
"epoch": 1.920619404675337,
"grad_norm": 39.61098098754883,
"learning_rate": 2.0480972515856237e-07,
"loss": 0.3493,
"step": 11240
},
{
"epoch": 1.9223280524197235,
"grad_norm": 38.638954162597656,
"learning_rate": 2.004052149400987e-07,
"loss": 0.3626,
"step": 11250
},
{
"epoch": 1.9240367001641097,
"grad_norm": 29.187583923339844,
"learning_rate": 1.9600070472163498e-07,
"loss": 0.3438,
"step": 11260
},
{
"epoch": 1.9257453479084958,
"grad_norm": 28.951478958129883,
"learning_rate": 1.9159619450317125e-07,
"loss": 0.3602,
"step": 11270
},
{
"epoch": 1.9274539956528822,
"grad_norm": 23.78569984436035,
"learning_rate": 1.8719168428470754e-07,
"loss": 0.4069,
"step": 11280
},
{
"epoch": 1.9291626433972686,
"grad_norm": 28.406557083129883,
"learning_rate": 1.8278717406624386e-07,
"loss": 0.3304,
"step": 11290
},
{
"epoch": 1.930871291141655,
"grad_norm": 36.53167724609375,
"learning_rate": 1.7838266384778015e-07,
"loss": 0.3741,
"step": 11300
},
{
"epoch": 1.9325799388860412,
"grad_norm": 39.00297164916992,
"learning_rate": 1.7397815362931644e-07,
"loss": 0.3252,
"step": 11310
},
{
"epoch": 1.9342885866304274,
"grad_norm": 37.43632888793945,
"learning_rate": 1.695736434108527e-07,
"loss": 0.3953,
"step": 11320
},
{
"epoch": 1.9359972343748137,
"grad_norm": 34.39417266845703,
"learning_rate": 1.6516913319238902e-07,
"loss": 0.4112,
"step": 11330
},
{
"epoch": 1.9377058821192001,
"grad_norm": 31.974533081054688,
"learning_rate": 1.6076462297392531e-07,
"loss": 0.3451,
"step": 11340
},
{
"epoch": 1.9394145298635863,
"grad_norm": 26.460182189941406,
"learning_rate": 1.563601127554616e-07,
"loss": 0.318,
"step": 11350
},
{
"epoch": 1.9411231776079725,
"grad_norm": 37.38439178466797,
"learning_rate": 1.519556025369979e-07,
"loss": 0.3795,
"step": 11360
},
{
"epoch": 1.9428318253523589,
"grad_norm": 21.876747131347656,
"learning_rate": 1.475510923185342e-07,
"loss": 0.3331,
"step": 11370
},
{
"epoch": 1.9445404730967453,
"grad_norm": 38.800811767578125,
"learning_rate": 1.4314658210007048e-07,
"loss": 0.4218,
"step": 11380
},
{
"epoch": 1.9462491208411317,
"grad_norm": 32.05302810668945,
"learning_rate": 1.3874207188160677e-07,
"loss": 0.3484,
"step": 11390
},
{
"epoch": 1.9479577685855178,
"grad_norm": 28.057106018066406,
"learning_rate": 1.3433756166314306e-07,
"loss": 0.3579,
"step": 11400
},
{
"epoch": 1.949666416329904,
"grad_norm": 34.402259826660156,
"learning_rate": 1.2993305144467938e-07,
"loss": 0.3871,
"step": 11410
},
{
"epoch": 1.9513750640742904,
"grad_norm": 23.892139434814453,
"learning_rate": 1.2552854122621565e-07,
"loss": 0.3813,
"step": 11420
},
{
"epoch": 1.9530837118186768,
"grad_norm": 33.99159240722656,
"learning_rate": 1.2112403100775197e-07,
"loss": 0.3698,
"step": 11430
},
{
"epoch": 1.954792359563063,
"grad_norm": 39.73171615600586,
"learning_rate": 1.1671952078928824e-07,
"loss": 0.3888,
"step": 11440
},
{
"epoch": 1.9565010073074494,
"grad_norm": 40.77421951293945,
"learning_rate": 1.1231501057082454e-07,
"loss": 0.3514,
"step": 11450
},
{
"epoch": 1.9582096550518355,
"grad_norm": 32.23746871948242,
"learning_rate": 1.0791050035236083e-07,
"loss": 0.3555,
"step": 11460
},
{
"epoch": 1.959918302796222,
"grad_norm": 37.01948165893555,
"learning_rate": 1.0350599013389712e-07,
"loss": 0.3536,
"step": 11470
},
{
"epoch": 1.9616269505406083,
"grad_norm": 29.169981002807617,
"learning_rate": 9.910147991543341e-08,
"loss": 0.3298,
"step": 11480
},
{
"epoch": 1.9633355982849945,
"grad_norm": 21.912952423095703,
"learning_rate": 9.46969696969697e-08,
"loss": 0.2856,
"step": 11490
},
{
"epoch": 1.9650442460293807,
"grad_norm": 31.11917495727539,
"learning_rate": 9.0292459478506e-08,
"loss": 0.3485,
"step": 11500
},
{
"epoch": 1.966752893773767,
"grad_norm": 38.79056167602539,
"learning_rate": 8.588794926004229e-08,
"loss": 0.3723,
"step": 11510
},
{
"epoch": 1.9684615415181534,
"grad_norm": 26.17238426208496,
"learning_rate": 8.148343904157858e-08,
"loss": 0.3234,
"step": 11520
},
{
"epoch": 1.9701701892625398,
"grad_norm": 32.50604248046875,
"learning_rate": 7.707892882311487e-08,
"loss": 0.3453,
"step": 11530
},
{
"epoch": 1.971878837006926,
"grad_norm": 42.897769927978516,
"learning_rate": 7.267441860465117e-08,
"loss": 0.3336,
"step": 11540
},
{
"epoch": 1.9735874847513122,
"grad_norm": 31.65463638305664,
"learning_rate": 6.826990838618747e-08,
"loss": 0.3547,
"step": 11550
},
{
"epoch": 1.9752961324956986,
"grad_norm": 36.5434684753418,
"learning_rate": 6.386539816772376e-08,
"loss": 0.3433,
"step": 11560
},
{
"epoch": 1.977004780240085,
"grad_norm": 20.959672927856445,
"learning_rate": 5.946088794926004e-08,
"loss": 0.331,
"step": 11570
},
{
"epoch": 1.9787134279844711,
"grad_norm": 32.63426971435547,
"learning_rate": 5.5056377730796334e-08,
"loss": 0.3522,
"step": 11580
},
{
"epoch": 1.9804220757288573,
"grad_norm": 30.64798927307129,
"learning_rate": 5.065186751233264e-08,
"loss": 0.3582,
"step": 11590
},
{
"epoch": 1.9821307234732437,
"grad_norm": 26.718338012695312,
"learning_rate": 4.624735729386893e-08,
"loss": 0.3232,
"step": 11600
},
{
"epoch": 1.98383937121763,
"grad_norm": 34.196048736572266,
"learning_rate": 4.184284707540522e-08,
"loss": 0.329,
"step": 11610
},
{
"epoch": 1.9855480189620165,
"grad_norm": 41.088165283203125,
"learning_rate": 3.7438336856941514e-08,
"loss": 0.3263,
"step": 11620
},
{
"epoch": 1.9872566667064027,
"grad_norm": 30.549293518066406,
"learning_rate": 3.3033826638477806e-08,
"loss": 0.3245,
"step": 11630
},
{
"epoch": 1.9889653144507888,
"grad_norm": 43.00529098510742,
"learning_rate": 2.8629316420014098e-08,
"loss": 0.3949,
"step": 11640
},
{
"epoch": 1.9906739621951752,
"grad_norm": 38.130096435546875,
"learning_rate": 2.422480620155039e-08,
"loss": 0.3707,
"step": 11650
},
{
"epoch": 1.9923826099395616,
"grad_norm": 32.19184112548828,
"learning_rate": 1.982029598308668e-08,
"loss": 0.3876,
"step": 11660
},
{
"epoch": 1.994091257683948,
"grad_norm": 29.53122329711914,
"learning_rate": 1.5415785764622976e-08,
"loss": 0.3743,
"step": 11670
},
{
"epoch": 1.9957999054283342,
"grad_norm": 28.583572387695312,
"learning_rate": 1.1011275546159268e-08,
"loss": 0.3703,
"step": 11680
},
{
"epoch": 1.9975085531727204,
"grad_norm": 22.58035659790039,
"learning_rate": 6.606765327695561e-09,
"loss": 0.3334,
"step": 11690
},
{
"epoch": 1.9992172009171068,
"grad_norm": 40.383663177490234,
"learning_rate": 2.2022551092318538e-09,
"loss": 0.3615,
"step": 11700
},
{
"epoch": 1.9999006600148612,
"eval_loss": 0.954230546951294,
"eval_runtime": 137.9658,
"eval_samples_per_second": 71.46,
"eval_steps_per_second": 8.937,
"step": 11704
}
],
"logging_steps": 10,
"max_steps": 11704,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.0899612102793626e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}