{
"best_metric": 1.6921895742416382,
"best_model_checkpoint": "./output/checkpoint-4500",
"epoch": 0.15508384220219057,
"eval_steps": 150,
"global_step": 4800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00032309133792123035,
"grad_norm": 17.601825714111328,
"learning_rate": 2.2360679774997904e-06,
"loss": 1.8352,
"step": 10
},
{
"epoch": 0.0006461826758424607,
"grad_norm": 7.03536319732666,
"learning_rate": 4.472135954999581e-06,
"loss": 1.7155,
"step": 20
},
{
"epoch": 0.000969274013763691,
"grad_norm": 9.148643493652344,
"learning_rate": 6.70820393249937e-06,
"loss": 1.6636,
"step": 30
},
{
"epoch": 0.0012923653516849214,
"grad_norm": 8.976370811462402,
"learning_rate": 8.944271909999161e-06,
"loss": 1.7126,
"step": 40
},
{
"epoch": 0.0016154566896061516,
"grad_norm": 8.681303977966309,
"learning_rate": 1.118033988749895e-05,
"loss": 1.8704,
"step": 50
},
{
"epoch": 0.001938548027527382,
"grad_norm": 10.47424030303955,
"learning_rate": 1.341640786499874e-05,
"loss": 1.8515,
"step": 60
},
{
"epoch": 0.0022616393654486125,
"grad_norm": 13.055950164794922,
"learning_rate": 1.565247584249853e-05,
"loss": 1.7705,
"step": 70
},
{
"epoch": 0.0025847307033698428,
"grad_norm": 8.785000801086426,
"learning_rate": 1.7888543819998323e-05,
"loss": 1.91,
"step": 80
},
{
"epoch": 0.002907822041291073,
"grad_norm": 11.010156631469727,
"learning_rate": 2.0124611797498112e-05,
"loss": 1.8089,
"step": 90
},
{
"epoch": 0.0032309133792123032,
"grad_norm": 9.710369110107422,
"learning_rate": 2.23606797749979e-05,
"loss": 1.8483,
"step": 100
},
{
"epoch": 0.0035540047171335335,
"grad_norm": 9.487078666687012,
"learning_rate": 2.236044998500671e-05,
"loss": 1.8135,
"step": 110
},
{
"epoch": 0.003877096055054764,
"grad_norm": 9.57800579071045,
"learning_rate": 2.235976062447891e-05,
"loss": 1.6482,
"step": 120
},
{
"epoch": 0.004200187392975994,
"grad_norm": 7.63389253616333,
"learning_rate": 2.2358611721751407e-05,
"loss": 1.8626,
"step": 130
},
{
"epoch": 0.004523278730897225,
"grad_norm": 11.528849601745605,
"learning_rate": 2.2357003324051093e-05,
"loss": 1.7989,
"step": 140
},
{
"epoch": 0.004846370068818455,
"grad_norm": 12.541647911071777,
"learning_rate": 2.23549354974929e-05,
"loss": 1.7222,
"step": 150
},
{
"epoch": 0.004846370068818455,
"eval_loss": 1.74934720993042,
"eval_runtime": 45.4353,
"eval_samples_per_second": 11.027,
"eval_steps_per_second": 11.027,
"step": 150
},
{
"epoch": 0.0051694614067396855,
"grad_norm": 11.141366958618164,
"learning_rate": 2.2352408327077078e-05,
"loss": 1.7504,
"step": 160
},
{
"epoch": 0.005492552744660916,
"grad_norm": 11.372058868408203,
"learning_rate": 2.2349421916685704e-05,
"loss": 1.8559,
"step": 170
},
{
"epoch": 0.005815644082582146,
"grad_norm": 8.757671356201172,
"learning_rate": 2.234597638907841e-05,
"loss": 1.9035,
"step": 180
},
{
"epoch": 0.006138735420503376,
"grad_norm": 8.273791313171387,
"learning_rate": 2.2342071885887346e-05,
"loss": 1.8452,
"step": 190
},
{
"epoch": 0.0064618267584246065,
"grad_norm": 11.234216690063477,
"learning_rate": 2.2337708567611343e-05,
"loss": 1.9133,
"step": 200
},
{
"epoch": 0.006784918096345837,
"grad_norm": 9.991055488586426,
"learning_rate": 2.233288661360932e-05,
"loss": 1.9328,
"step": 210
},
{
"epoch": 0.007108009434267067,
"grad_norm": 10.776007652282715,
"learning_rate": 2.232760622209293e-05,
"loss": 1.7055,
"step": 220
},
{
"epoch": 0.007431100772188298,
"grad_norm": 7.8985490798950195,
"learning_rate": 2.2321867610118378e-05,
"loss": 1.7584,
"step": 230
},
{
"epoch": 0.007754192110109528,
"grad_norm": 11.129011154174805,
"learning_rate": 2.231567101357753e-05,
"loss": 1.7202,
"step": 240
},
{
"epoch": 0.008077283448030758,
"grad_norm": 10.667845726013184,
"learning_rate": 2.2309016687188194e-05,
"loss": 1.704,
"step": 250
},
{
"epoch": 0.008400374785951988,
"grad_norm": 13.889586448669434,
"learning_rate": 2.230190490448367e-05,
"loss": 1.7678,
"step": 260
},
{
"epoch": 0.008723466123873218,
"grad_norm": 8.101602554321289,
"learning_rate": 2.229433595780149e-05,
"loss": 1.6418,
"step": 270
},
{
"epoch": 0.00904655746179445,
"grad_norm": 11.362483978271484,
"learning_rate": 2.2286310158271407e-05,
"loss": 1.848,
"step": 280
},
{
"epoch": 0.00936964879971568,
"grad_norm": 8.67332935333252,
"learning_rate": 2.22778278358026e-05,
"loss": 1.7263,
"step": 290
},
{
"epoch": 0.00969274013763691,
"grad_norm": 14.691208839416504,
"learning_rate": 2.2268889339070124e-05,
"loss": 1.7203,
"step": 300
},
{
"epoch": 0.00969274013763691,
"eval_loss": 1.7411584854125977,
"eval_runtime": 49.9553,
"eval_samples_per_second": 10.029,
"eval_steps_per_second": 10.029,
"step": 300
},
{
"epoch": 0.01001583147555814,
"grad_norm": 13.153305053710938,
"learning_rate": 2.2259495035500576e-05,
"loss": 1.6979,
"step": 310
},
{
"epoch": 0.010338922813479371,
"grad_norm": 7.682581901550293,
"learning_rate": 2.2249645311256972e-05,
"loss": 1.8388,
"step": 320
},
{
"epoch": 0.010662014151400601,
"grad_norm": 6.891665935516357,
"learning_rate": 2.2239340571222904e-05,
"loss": 1.7465,
"step": 330
},
{
"epoch": 0.010985105489321832,
"grad_norm": 10.929823875427246,
"learning_rate": 2.2228581238985868e-05,
"loss": 1.763,
"step": 340
},
{
"epoch": 0.011308196827243062,
"grad_norm": 9.032389640808105,
"learning_rate": 2.2217367756819878e-05,
"loss": 1.8118,
"step": 350
},
{
"epoch": 0.011631288165164292,
"grad_norm": 13.457771301269531,
"learning_rate": 2.2205700585667257e-05,
"loss": 1.84,
"step": 360
},
{
"epoch": 0.011954379503085522,
"grad_norm": 15.551021575927734,
"learning_rate": 2.2193580205119724e-05,
"loss": 1.7466,
"step": 370
},
{
"epoch": 0.012277470841006752,
"grad_norm": 12.1012601852417,
"learning_rate": 2.2181007113398642e-05,
"loss": 1.5802,
"step": 380
},
{
"epoch": 0.012600562178927983,
"grad_norm": 11.149864196777344,
"learning_rate": 2.216798182733457e-05,
"loss": 1.7217,
"step": 390
},
{
"epoch": 0.012923653516849213,
"grad_norm": 10.7238130569458,
"learning_rate": 2.2154504882346002e-05,
"loss": 1.6624,
"step": 400
},
{
"epoch": 0.013246744854770443,
"grad_norm": 8.965182304382324,
"learning_rate": 2.214057683241736e-05,
"loss": 1.7672,
"step": 410
},
{
"epoch": 0.013569836192691673,
"grad_norm": 7.6960673332214355,
"learning_rate": 2.2126198250076225e-05,
"loss": 1.6535,
"step": 420
},
{
"epoch": 0.013892927530612904,
"grad_norm": 11.47298812866211,
"learning_rate": 2.2111369726369802e-05,
"loss": 1.7447,
"step": 430
},
{
"epoch": 0.014216018868534134,
"grad_norm": 8.105657577514648,
"learning_rate": 2.2096091870840613e-05,
"loss": 1.7348,
"step": 440
},
{
"epoch": 0.014539110206455364,
"grad_norm": 9.129870414733887,
"learning_rate": 2.2080365311501466e-05,
"loss": 1.832,
"step": 450
},
{
"epoch": 0.014539110206455364,
"eval_loss": 1.738905906677246,
"eval_runtime": 51.3901,
"eval_samples_per_second": 9.749,
"eval_steps_per_second": 9.749,
"step": 450
},
{
"epoch": 0.014862201544376596,
"grad_norm": 7.760665416717529,
"learning_rate": 2.206419069480962e-05,
"loss": 1.6879,
"step": 460
},
{
"epoch": 0.015185292882297826,
"grad_norm": 11.239582061767578,
"learning_rate": 2.2047568685640212e-05,
"loss": 1.7155,
"step": 470
},
{
"epoch": 0.015508384220219057,
"grad_norm": 11.516200065612793,
"learning_rate": 2.203049996725894e-05,
"loss": 1.8277,
"step": 480
},
{
"epoch": 0.015831475558140285,
"grad_norm": 7.8999834060668945,
"learning_rate": 2.2012985241293954e-05,
"loss": 1.9313,
"step": 490
},
{
"epoch": 0.016154566896061515,
"grad_norm": 11.626410484313965,
"learning_rate": 2.1995025227707044e-05,
"loss": 1.8176,
"step": 500
},
{
"epoch": 0.016477658233982746,
"grad_norm": 10.57736873626709,
"learning_rate": 2.1976620664764027e-05,
"loss": 1.6413,
"step": 510
},
{
"epoch": 0.016800749571903976,
"grad_norm": 10.937850952148438,
"learning_rate": 2.1957772309004394e-05,
"loss": 1.7362,
"step": 520
},
{
"epoch": 0.017123840909825206,
"grad_norm": 12.991289138793945,
"learning_rate": 2.1938480935210228e-05,
"loss": 1.8159,
"step": 530
},
{
"epoch": 0.017446932247746436,
"grad_norm": 8.747649192810059,
"learning_rate": 2.1918747336374347e-05,
"loss": 1.8647,
"step": 540
},
{
"epoch": 0.017770023585667667,
"grad_norm": 7.261239051818848,
"learning_rate": 2.189857232366771e-05,
"loss": 1.8095,
"step": 550
},
{
"epoch": 0.0180931149235889,
"grad_norm": 11.28962516784668,
"learning_rate": 2.1877956726406063e-05,
"loss": 1.7219,
"step": 560
},
{
"epoch": 0.01841620626151013,
"grad_norm": 12.047504425048828,
"learning_rate": 2.1856901392015874e-05,
"loss": 1.9073,
"step": 570
},
{
"epoch": 0.01873929759943136,
"grad_norm": 13.037126541137695,
"learning_rate": 2.183540718599946e-05,
"loss": 1.8341,
"step": 580
},
{
"epoch": 0.01906238893735259,
"grad_norm": 13.498525619506836,
"learning_rate": 2.1813474991899453e-05,
"loss": 1.9031,
"step": 590
},
{
"epoch": 0.01938548027527382,
"grad_norm": 8.813241958618164,
"learning_rate": 2.1791105711262442e-05,
"loss": 1.6959,
"step": 600
},
{
"epoch": 0.01938548027527382,
"eval_loss": 1.7352592945098877,
"eval_runtime": 45.3637,
"eval_samples_per_second": 11.044,
"eval_steps_per_second": 11.044,
"step": 600
},
{
"epoch": 0.01970857161319505,
"grad_norm": 12.009414672851562,
"learning_rate": 2.1768300263601945e-05,
"loss": 1.9202,
"step": 610
},
{
"epoch": 0.02003166295111628,
"grad_norm": 9.502263069152832,
"learning_rate": 2.174505958636059e-05,
"loss": 1.8399,
"step": 620
},
{
"epoch": 0.020354754289037512,
"grad_norm": 8.802181243896484,
"learning_rate": 2.1721384634871592e-05,
"loss": 1.7752,
"step": 630
},
{
"epoch": 0.020677845626958742,
"grad_norm": 10.21849250793457,
"learning_rate": 2.169727638231948e-05,
"loss": 1.8093,
"step": 640
},
{
"epoch": 0.021000936964879972,
"grad_norm": 13.870856285095215,
"learning_rate": 2.1672735819700084e-05,
"loss": 1.7692,
"step": 650
},
{
"epoch": 0.021324028302801203,
"grad_norm": 8.835685729980469,
"learning_rate": 2.1647763955779823e-05,
"loss": 1.7946,
"step": 660
},
{
"epoch": 0.021647119640722433,
"grad_norm": 10.815114974975586,
"learning_rate": 2.1622361817054213e-05,
"loss": 1.7859,
"step": 670
},
{
"epoch": 0.021970210978643663,
"grad_norm": 7.781187057495117,
"learning_rate": 2.1596530447705676e-05,
"loss": 1.799,
"step": 680
},
{
"epoch": 0.022293302316564893,
"grad_norm": 7.180160999298096,
"learning_rate": 2.157027090956064e-05,
"loss": 1.8043,
"step": 690
},
{
"epoch": 0.022616393654486124,
"grad_norm": 10.69900131225586,
"learning_rate": 2.1543584282045862e-05,
"loss": 1.8702,
"step": 700
},
{
"epoch": 0.022939484992407354,
"grad_norm": 7.213955402374268,
"learning_rate": 2.1516471662144077e-05,
"loss": 1.7494,
"step": 710
},
{
"epoch": 0.023262576330328584,
"grad_norm": 7.767466068267822,
"learning_rate": 2.1488934164348898e-05,
"loss": 1.8502,
"step": 720
},
{
"epoch": 0.023585667668249814,
"grad_norm": 11.879374504089355,
"learning_rate": 2.1460972920619e-05,
"loss": 1.7991,
"step": 730
},
{
"epoch": 0.023908759006171044,
"grad_norm": 14.04179859161377,
"learning_rate": 2.143258908033159e-05,
"loss": 1.7676,
"step": 740
},
{
"epoch": 0.024231850344092275,
"grad_norm": 13.560479164123535,
"learning_rate": 2.140378381023518e-05,
"loss": 1.7701,
"step": 750
},
{
"epoch": 0.024231850344092275,
"eval_loss": 1.735255241394043,
"eval_runtime": 50.7259,
"eval_samples_per_second": 9.877,
"eval_steps_per_second": 9.877,
"step": 750
},
{
"epoch": 0.024554941682013505,
"grad_norm": 14.157264709472656,
"learning_rate": 2.1374558294401597e-05,
"loss": 1.6387,
"step": 760
},
{
"epoch": 0.024878033019934735,
"grad_norm": 9.316694259643555,
"learning_rate": 2.134491373417733e-05,
"loss": 1.6191,
"step": 770
},
{
"epoch": 0.025201124357855965,
"grad_norm": 11.018717765808105,
"learning_rate": 2.1314851348134134e-05,
"loss": 1.9314,
"step": 780
},
{
"epoch": 0.025524215695777196,
"grad_norm": 7.385425567626953,
"learning_rate": 2.1284372372018963e-05,
"loss": 1.8197,
"step": 790
},
{
"epoch": 0.025847307033698426,
"grad_norm": 6.388603210449219,
"learning_rate": 2.125347805870314e-05,
"loss": 1.6863,
"step": 800
},
{
"epoch": 0.026170398371619656,
"grad_norm": 8.44832706451416,
"learning_rate": 2.122216967813088e-05,
"loss": 1.9365,
"step": 810
},
{
"epoch": 0.026493489709540886,
"grad_norm": 9.323631286621094,
"learning_rate": 2.1190448517267087e-05,
"loss": 1.7366,
"step": 820
},
{
"epoch": 0.026816581047462117,
"grad_norm": 8.1749849319458,
"learning_rate": 2.115831588004444e-05,
"loss": 1.7957,
"step": 830
},
{
"epoch": 0.027139672385383347,
"grad_norm": 8.410055160522461,
"learning_rate": 2.1125773087309798e-05,
"loss": 1.7546,
"step": 840
},
{
"epoch": 0.027462763723304577,
"grad_norm": 9.194031715393066,
"learning_rate": 2.1092821476769906e-05,
"loss": 1.7421,
"step": 850
},
{
"epoch": 0.027785855061225807,
"grad_norm": 7.863668918609619,
"learning_rate": 2.1059462402936416e-05,
"loss": 1.8438,
"step": 860
},
{
"epoch": 0.028108946399147038,
"grad_norm": 8.896843910217285,
"learning_rate": 2.102569723707019e-05,
"loss": 1.9148,
"step": 870
},
{
"epoch": 0.028432037737068268,
"grad_norm": 12.01693058013916,
"learning_rate": 2.0991527367124955e-05,
"loss": 1.6743,
"step": 880
},
{
"epoch": 0.028755129074989498,
"grad_norm": 15.558412551879883,
"learning_rate": 2.095695419769022e-05,
"loss": 1.742,
"step": 890
},
{
"epoch": 0.02907822041291073,
"grad_norm": 9.519794464111328,
"learning_rate": 2.0921979149933576e-05,
"loss": 1.8417,
"step": 900
},
{
"epoch": 0.02907822041291073,
"eval_loss": 1.7262463569641113,
"eval_runtime": 46.9684,
"eval_samples_per_second": 10.667,
"eval_steps_per_second": 10.667,
"step": 900
},
{
"epoch": 0.02940131175083196,
"grad_norm": 11.452165603637695,
"learning_rate": 2.0886603661542245e-05,
"loss": 1.6859,
"step": 910
},
{
"epoch": 0.029724403088753192,
"grad_norm": 14.136131286621094,
"learning_rate": 2.0850829186663994e-05,
"loss": 1.7964,
"step": 920
},
{
"epoch": 0.030047494426674422,
"grad_norm": 10.132573127746582,
"learning_rate": 2.0814657195847375e-05,
"loss": 1.8617,
"step": 930
},
{
"epoch": 0.030370585764595653,
"grad_norm": 12.094213485717773,
"learning_rate": 2.077808917598125e-05,
"loss": 1.7776,
"step": 940
},
{
"epoch": 0.030693677102516883,
"grad_norm": 13.876291275024414,
"learning_rate": 2.0741126630233687e-05,
"loss": 1.874,
"step": 950
},
{
"epoch": 0.031016768440438113,
"grad_norm": 7.611670017242432,
"learning_rate": 2.070377107799017e-05,
"loss": 1.818,
"step": 960
},
{
"epoch": 0.03133985977835934,
"grad_norm": 7.113893985748291,
"learning_rate": 2.0666024054791137e-05,
"loss": 1.795,
"step": 970
},
{
"epoch": 0.03166295111628057,
"grad_norm": 9.59799861907959,
"learning_rate": 2.0627887112268875e-05,
"loss": 1.7043,
"step": 980
},
{
"epoch": 0.0319860424542018,
"grad_norm": 9.458488464355469,
"learning_rate": 2.0589361818083712e-05,
"loss": 1.8802,
"step": 990
},
{
"epoch": 0.03230913379212303,
"grad_norm": 18.441673278808594,
"learning_rate": 2.0550449755859598e-05,
"loss": 1.7685,
"step": 1000
},
{
"epoch": 0.03263222513004426,
"grad_norm": 10.183686256408691,
"learning_rate": 2.0511152525119014e-05,
"loss": 1.7086,
"step": 1010
},
{
"epoch": 0.03295531646796549,
"grad_norm": 10.062090873718262,
"learning_rate": 2.0471471741217183e-05,
"loss": 1.6957,
"step": 1020
},
{
"epoch": 0.03327840780588672,
"grad_norm": 13.140121459960938,
"learning_rate": 2.0431409035275724e-05,
"loss": 1.8548,
"step": 1030
},
{
"epoch": 0.03360149914380795,
"grad_norm": 12.769956588745117,
"learning_rate": 2.0390966054115558e-05,
"loss": 1.8432,
"step": 1040
},
{
"epoch": 0.03392459048172918,
"grad_norm": 8.896726608276367,
"learning_rate": 2.035014446018924e-05,
"loss": 1.8244,
"step": 1050
},
{
"epoch": 0.03392459048172918,
"eval_loss": 1.732542634010315,
"eval_runtime": 48.4752,
"eval_samples_per_second": 10.335,
"eval_steps_per_second": 10.335,
"step": 1050
},
{
"epoch": 0.03424768181965041,
"grad_norm": 9.040741920471191,
"learning_rate": 2.0308945931512606e-05,
"loss": 1.8755,
"step": 1060
},
{
"epoch": 0.03457077315757164,
"grad_norm": 10.011199951171875,
"learning_rate": 2.0267372161595806e-05,
"loss": 1.9373,
"step": 1070
},
{
"epoch": 0.03489386449549287,
"grad_norm": 11.096688270568848,
"learning_rate": 2.022542485937369e-05,
"loss": 1.7546,
"step": 1080
},
{
"epoch": 0.0352169558334141,
"grad_norm": 8.579634666442871,
"learning_rate": 2.0183105749135553e-05,
"loss": 1.7737,
"step": 1090
},
{
"epoch": 0.03554004717133533,
"grad_norm": 12.04051685333252,
"learning_rate": 2.0140416570454266e-05,
"loss": 1.7295,
"step": 1100
},
{
"epoch": 0.03586313850925657,
"grad_norm": 8.008216857910156,
"learning_rate": 2.0097359078114767e-05,
"loss": 1.7887,
"step": 1110
},
{
"epoch": 0.0361862298471778,
"grad_norm": 10.406676292419434,
"learning_rate": 2.0053935042041915e-05,
"loss": 1.6403,
"step": 1120
},
{
"epoch": 0.03650932118509903,
"grad_norm": 13.819786071777344,
"learning_rate": 2.001014624722775e-05,
"loss": 1.6146,
"step": 1130
},
{
"epoch": 0.03683241252302026,
"grad_norm": 8.4624605178833,
"learning_rate": 1.996599449365813e-05,
"loss": 1.7913,
"step": 1140
},
{
"epoch": 0.03715550386094149,
"grad_norm": 8.123451232910156,
"learning_rate": 1.9921481596238703e-05,
"loss": 1.735,
"step": 1150
},
{
"epoch": 0.03747859519886272,
"grad_norm": 10.989400863647461,
"learning_rate": 1.9876609384720335e-05,
"loss": 1.8681,
"step": 1160
},
{
"epoch": 0.03780168653678395,
"grad_norm": 10.746498107910156,
"learning_rate": 1.9831379703623903e-05,
"loss": 1.6177,
"step": 1170
},
{
"epoch": 0.03812477787470518,
"grad_norm": 10.412644386291504,
"learning_rate": 1.978579441216443e-05,
"loss": 1.8265,
"step": 1180
},
{
"epoch": 0.03844786921262641,
"grad_norm": 8.273632049560547,
"learning_rate": 1.9739855384174708e-05,
"loss": 1.7172,
"step": 1190
},
{
"epoch": 0.03877096055054764,
"grad_norm": 9.549877166748047,
"learning_rate": 1.969356450802825e-05,
"loss": 1.618,
"step": 1200
},
{
"epoch": 0.03877096055054764,
"eval_loss": 1.733805775642395,
"eval_runtime": 50.1723,
"eval_samples_per_second": 9.986,
"eval_steps_per_second": 9.986,
"step": 1200
},
{
"epoch": 0.03909405188846887,
"grad_norm": 11.008418083190918,
"learning_rate": 1.964692368656166e-05,
"loss": 1.7639,
"step": 1210
},
{
"epoch": 0.0394171432263901,
"grad_norm": 9.95077133178711,
"learning_rate": 1.9599934836996435e-05,
"loss": 1.6824,
"step": 1220
},
{
"epoch": 0.03974023456431133,
"grad_norm": 9.953926086425781,
"learning_rate": 1.9552599890860126e-05,
"loss": 1.7993,
"step": 1230
},
{
"epoch": 0.04006332590223256,
"grad_norm": 8.160842895507812,
"learning_rate": 1.9504920793906985e-05,
"loss": 1.6618,
"step": 1240
},
{
"epoch": 0.040386417240153794,
"grad_norm": 10.420413970947266,
"learning_rate": 1.945689950603793e-05,
"loss": 1.774,
"step": 1250
},
{
"epoch": 0.040709508578075024,
"grad_norm": 6.669633388519287,
"learning_rate": 1.9408538001220032e-05,
"loss": 1.6355,
"step": 1260
},
{
"epoch": 0.041032599915996254,
"grad_norm": 9.326558113098145,
"learning_rate": 1.9359838267405318e-05,
"loss": 1.644,
"step": 1270
},
{
"epoch": 0.041355691253917484,
"grad_norm": 9.282320976257324,
"learning_rate": 1.931080230644911e-05,
"loss": 1.7776,
"step": 1280
},
{
"epoch": 0.041678782591838714,
"grad_norm": 7.603001594543457,
"learning_rate": 1.926143213402771e-05,
"loss": 1.7819,
"step": 1290
},
{
"epoch": 0.042001873929759945,
"grad_norm": 7.637514114379883,
"learning_rate": 1.921172977955552e-05,
"loss": 1.7667,
"step": 1300
},
{
"epoch": 0.042324965267681175,
"grad_norm": 9.193405151367188,
"learning_rate": 1.9161697286101677e-05,
"loss": 1.8003,
"step": 1310
},
{
"epoch": 0.042648056605602405,
"grad_norm": 7.766101837158203,
"learning_rate": 1.9111336710306013e-05,
"loss": 1.5436,
"step": 1320
},
{
"epoch": 0.042971147943523635,
"grad_norm": 9.50290298461914,
"learning_rate": 1.9060650122294554e-05,
"loss": 1.7709,
"step": 1330
},
{
"epoch": 0.043294239281444866,
"grad_norm": 10.983290672302246,
"learning_rate": 1.9009639605594407e-05,
"loss": 1.7847,
"step": 1340
},
{
"epoch": 0.043617330619366096,
"grad_norm": 9.261622428894043,
"learning_rate": 1.8958307257048116e-05,
"loss": 1.6588,
"step": 1350
},
{
"epoch": 0.043617330619366096,
"eval_loss": 1.7271850109100342,
"eval_runtime": 52.5068,
"eval_samples_per_second": 9.542,
"eval_steps_per_second": 9.542,
"step": 1350
},
{
"epoch": 0.043940421957287326,
"grad_norm": 13.047907829284668,
"learning_rate": 1.890665518672748e-05,
"loss": 1.6819,
"step": 1360
},
{
"epoch": 0.044263513295208556,
"grad_norm": 10.242769241333008,
"learning_rate": 1.88546855178468e-05,
"loss": 1.7274,
"step": 1370
},
{
"epoch": 0.04458660463312979,
"grad_norm": 10.755792617797852,
"learning_rate": 1.880240038667561e-05,
"loss": 1.8454,
"step": 1380
},
{
"epoch": 0.04490969597105102,
"grad_norm": 8.716181755065918,
"learning_rate": 1.874980194245087e-05,
"loss": 1.6762,
"step": 1390
},
{
"epoch": 0.04523278730897225,
"grad_norm": 11.95937728881836,
"learning_rate": 1.8696892347288606e-05,
"loss": 1.6698,
"step": 1400
},
{
"epoch": 0.04555587864689348,
"grad_norm": 9.280597686767578,
"learning_rate": 1.864367377609504e-05,
"loss": 1.7899,
"step": 1410
},
{
"epoch": 0.04587896998481471,
"grad_norm": 9.891837120056152,
"learning_rate": 1.8590148416477198e-05,
"loss": 1.8089,
"step": 1420
},
{
"epoch": 0.04620206132273594,
"grad_norm": 16.32816505432129,
"learning_rate": 1.8536318468652962e-05,
"loss": 1.6734,
"step": 1430
},
{
"epoch": 0.04652515266065717,
"grad_norm": 8.544951438903809,
"learning_rate": 1.8482186145360648e-05,
"loss": 1.7713,
"step": 1440
},
{
"epoch": 0.0468482439985784,
"grad_norm": 10.275934219360352,
"learning_rate": 1.8427753671768056e-05,
"loss": 1.7716,
"step": 1450
},
{
"epoch": 0.04717133533649963,
"grad_norm": 11.277603149414062,
"learning_rate": 1.8373023285380966e-05,
"loss": 1.7318,
"step": 1460
},
{
"epoch": 0.04749442667442086,
"grad_norm": 8.100369453430176,
"learning_rate": 1.8317997235951204e-05,
"loss": 1.7877,
"step": 1470
},
{
"epoch": 0.04781751801234209,
"grad_norm": 9.99820613861084,
"learning_rate": 1.8262677785384142e-05,
"loss": 1.8218,
"step": 1480
},
{
"epoch": 0.04814060935026332,
"grad_norm": 12.380565643310547,
"learning_rate": 1.8207067207645716e-05,
"loss": 1.8257,
"step": 1490
},
{
"epoch": 0.04846370068818455,
"grad_norm": 9.522893905639648,
"learning_rate": 1.815116778866897e-05,
"loss": 1.9182,
"step": 1500
},
{
"epoch": 0.04846370068818455,
"eval_loss": 1.7235751152038574,
"eval_runtime": 52.6329,
"eval_samples_per_second": 9.519,
"eval_steps_per_second": 9.519,
"step": 1500
},
{
"epoch": 0.04878679202610578,
"grad_norm": 10.540502548217773,
"learning_rate": 1.8094981826260064e-05,
"loss": 1.787,
"step": 1510
},
{
"epoch": 0.04910988336402701,
"grad_norm": 18.479272842407227,
"learning_rate": 1.8038511630003865e-05,
"loss": 1.6268,
"step": 1520
},
{
"epoch": 0.04943297470194824,
"grad_norm": 8.567608833312988,
"learning_rate": 1.798175952116895e-05,
"loss": 1.8189,
"step": 1530
},
{
"epoch": 0.04975606603986947,
"grad_norm": 8.42313003540039,
"learning_rate": 1.7924727832612227e-05,
"loss": 1.6656,
"step": 1540
},
{
"epoch": 0.0500791573777907,
"grad_norm": 11.321965217590332,
"learning_rate": 1.786741890868305e-05,
"loss": 1.8461,
"step": 1550
},
{
"epoch": 0.05040224871571193,
"grad_norm": 13.762293815612793,
"learning_rate": 1.7809835105126807e-05,
"loss": 1.6322,
"step": 1560
},
{
"epoch": 0.05072534005363316,
"grad_norm": 10.186868667602539,
"learning_rate": 1.7751978788988123e-05,
"loss": 1.7285,
"step": 1570
},
{
"epoch": 0.05104843139155439,
"grad_norm": 12.446249008178711,
"learning_rate": 1.7693852338513545e-05,
"loss": 1.7648,
"step": 1580
},
{
"epoch": 0.05137152272947562,
"grad_norm": 12.205934524536133,
"learning_rate": 1.7635458143053794e-05,
"loss": 1.8282,
"step": 1590
},
{
"epoch": 0.05169461406739685,
"grad_norm": 11.627687454223633,
"learning_rate": 1.7576798602965525e-05,
"loss": 1.7931,
"step": 1600
},
{
"epoch": 0.05201770540531808,
"grad_norm": 9.943934440612793,
"learning_rate": 1.7517876129512677e-05,
"loss": 1.8118,
"step": 1610
},
{
"epoch": 0.05234079674323931,
"grad_norm": 9.162341117858887,
"learning_rate": 1.7458693144767353e-05,
"loss": 2.0104,
"step": 1620
},
{
"epoch": 0.05266388808116054,
"grad_norm": 11.957560539245605,
"learning_rate": 1.7399252081510248e-05,
"loss": 1.7413,
"step": 1630
},
{
"epoch": 0.05298697941908177,
"grad_norm": 10.325530052185059,
"learning_rate": 1.733955538313066e-05,
"loss": 1.9299,
"step": 1640
},
{
"epoch": 0.053310070757003,
"grad_norm": 13.011626243591309,
"learning_rate": 1.7279605503526047e-05,
"loss": 1.8611,
"step": 1650
},
{
"epoch": 0.053310070757003,
"eval_loss": 1.7197445631027222,
"eval_runtime": 50.0407,
"eval_samples_per_second": 10.012,
"eval_steps_per_second": 10.012,
"step": 1650
},
{
"epoch": 0.05363316209492423,
"grad_norm": 9.317269325256348,
"learning_rate": 1.721940490700115e-05,
"loss": 1.8115,
"step": 1660
},
{
"epoch": 0.053956253432845463,
"grad_norm": 9.647391319274902,
"learning_rate": 1.7158956068166697e-05,
"loss": 1.8057,
"step": 1670
},
{
"epoch": 0.054279344770766694,
"grad_norm": 13.944653511047363,
"learning_rate": 1.7098261471837696e-05,
"loss": 1.732,
"step": 1680
},
{
"epoch": 0.054602436108687924,
"grad_norm": 14.09939956665039,
"learning_rate": 1.7037323612931272e-05,
"loss": 1.6199,
"step": 1690
},
{
"epoch": 0.054925527446609154,
"grad_norm": 15.80185604095459,
"learning_rate": 1.697614499636414e-05,
"loss": 1.727,
"step": 1700
},
{
"epoch": 0.055248618784530384,
"grad_norm": 11.975440979003906,
"learning_rate": 1.6914728136949594e-05,
"loss": 1.5241,
"step": 1710
},
{
"epoch": 0.055571710122451615,
"grad_norm": 9.165396690368652,
"learning_rate": 1.6853075559294172e-05,
"loss": 1.835,
"step": 1720
},
{
"epoch": 0.055894801460372845,
"grad_norm": 6.48829460144043,
"learning_rate": 1.6791189797693877e-05,
"loss": 1.8819,
"step": 1730
},
{
"epoch": 0.056217892798294075,
"grad_norm": 9.731879234313965,
"learning_rate": 1.6729073396029965e-05,
"loss": 1.7085,
"step": 1740
},
{
"epoch": 0.056540984136215305,
"grad_norm": 8.751827239990234,
"learning_rate": 1.666672890766442e-05,
"loss": 1.7571,
"step": 1750
},
{
"epoch": 0.056864075474136536,
"grad_norm": 7.602952480316162,
"learning_rate": 1.660415889533497e-05,
"loss": 1.732,
"step": 1760
},
{
"epoch": 0.057187166812057766,
"grad_norm": 7.001266002655029,
"learning_rate": 1.6541365931049757e-05,
"loss": 1.7523,
"step": 1770
},
{
"epoch": 0.057510258149978996,
"grad_norm": 9.747156143188477,
"learning_rate": 1.6478352595981594e-05,
"loss": 1.7293,
"step": 1780
},
{
"epoch": 0.057833349487900226,
"grad_norm": 9.404924392700195,
"learning_rate": 1.6415121480361884e-05,
"loss": 1.7708,
"step": 1790
},
{
"epoch": 0.05815644082582146,
"grad_norm": 8.594147682189941,
"learning_rate": 1.635167518337413e-05,
"loss": 1.6754,
"step": 1800
},
{
"epoch": 0.05815644082582146,
"eval_loss": 1.7189382314682007,
"eval_runtime": 49.5978,
"eval_samples_per_second": 10.101,
"eval_steps_per_second": 10.101,
"step": 1800
},
{
"epoch": 0.05847953216374269,
"grad_norm": 10.03783893585205,
"learning_rate": 1.6288016313047095e-05,
"loss": 1.7414,
"step": 1810
},
{
"epoch": 0.05880262350166392,
"grad_norm": 11.099542617797852,
"learning_rate": 1.6224147486147602e-05,
"loss": 1.8542,
"step": 1820
},
{
"epoch": 0.05912571483958515,
"grad_norm": 10.364195823669434,
"learning_rate": 1.616007132807298e-05,
"loss": 1.7869,
"step": 1830
},
{
"epoch": 0.059448806177506384,
"grad_norm": 8.265610694885254,
"learning_rate": 1.6095790472743107e-05,
"loss": 1.7011,
"step": 1840
},
{
"epoch": 0.059771897515427615,
"grad_norm": 13.766877174377441,
"learning_rate": 1.6031307562492174e-05,
"loss": 1.6642,
"step": 1850
},
{
"epoch": 0.060094988853348845,
"grad_norm": 6.757899761199951,
"learning_rate": 1.5966625247960068e-05,
"loss": 1.7962,
"step": 1860
},
{
"epoch": 0.060418080191270075,
"grad_norm": 10.730036735534668,
"learning_rate": 1.5901746187983387e-05,
"loss": 1.6452,
"step": 1870
},
{
"epoch": 0.060741171529191305,
"grad_norm": 7.790616035461426,
"learning_rate": 1.5836673049486175e-05,
"loss": 1.7794,
"step": 1880
},
{
"epoch": 0.061064262867112536,
"grad_norm": 10.735581398010254,
"learning_rate": 1.577140850737029e-05,
"loss": 1.7813,
"step": 1890
},
{
"epoch": 0.061387354205033766,
"grad_norm": 14.150415420532227,
"learning_rate": 1.5705955244405423e-05,
"loss": 1.6433,
"step": 1900
},
{
"epoch": 0.061710445542954996,
"grad_norm": 9.497598648071289,
"learning_rate": 1.564031595111886e-05,
"loss": 1.6311,
"step": 1910
},
{
"epoch": 0.062033536880876226,
"grad_norm": 10.464118003845215,
"learning_rate": 1.557449332568485e-05,
"loss": 1.7038,
"step": 1920
},
{
"epoch": 0.06235662821879746,
"grad_norm": 10.506887435913086,
"learning_rate": 1.5508490073813722e-05,
"loss": 1.7814,
"step": 1930
},
{
"epoch": 0.06267971955671868,
"grad_norm": 17.466184616088867,
"learning_rate": 1.5442308908640636e-05,
"loss": 1.6902,
"step": 1940
},
{
"epoch": 0.06300281089463991,
"grad_norm": 10.4354829788208,
"learning_rate": 1.537595255061408e-05,
"loss": 1.7127,
"step": 1950
},
{
"epoch": 0.06300281089463991,
"eval_loss": 1.713944435119629,
"eval_runtime": 51.0974,
"eval_samples_per_second": 9.805,
"eval_steps_per_second": 9.805,
"step": 1950
},
{
"epoch": 0.06332590223256114,
"grad_norm": 9.260307312011719,
"learning_rate": 1.5309423727384037e-05,
"loss": 1.6463,
"step": 1960
},
{
"epoch": 0.06364899357048237,
"grad_norm": 9.762173652648926,
"learning_rate": 1.5242725173689851e-05,
"loss": 1.7202,
"step": 1970
},
{
"epoch": 0.0639720849084036,
"grad_norm": 16.75164222717285,
"learning_rate": 1.5175859631247827e-05,
"loss": 1.6976,
"step": 1980
},
{
"epoch": 0.06429517624632483,
"grad_norm": 7.676208972930908,
"learning_rate": 1.5108829848638515e-05,
"loss": 1.7599,
"step": 1990
},
{
"epoch": 0.06461826758424606,
"grad_norm": 9.695477485656738,
"learning_rate": 1.5041638581193741e-05,
"loss": 1.6792,
"step": 2000
},
{
"epoch": 0.06494135892216729,
"grad_norm": 16.161441802978516,
"learning_rate": 1.4974288590883346e-05,
"loss": 1.8177,
"step": 2010
},
{
"epoch": 0.06526445026008852,
"grad_norm": 11.451770782470703,
"learning_rate": 1.4906782646201634e-05,
"loss": 1.7105,
"step": 2020
},
{
"epoch": 0.06558754159800975,
"grad_norm": 10.997443199157715,
"learning_rate": 1.4839123522053591e-05,
"loss": 1.7168,
"step": 2030
},
{
"epoch": 0.06591063293593098,
"grad_norm": 6.7931132316589355,
"learning_rate": 1.4771313999640806e-05,
"loss": 1.769,
"step": 2040
},
{
"epoch": 0.06623372427385221,
"grad_norm": 9.296013832092285,
"learning_rate": 1.4703356866347155e-05,
"loss": 1.8983,
"step": 2050
},
{
"epoch": 0.06655681561177344,
"grad_norm": 10.765667915344238,
"learning_rate": 1.4635254915624214e-05,
"loss": 1.6917,
"step": 2060
},
{
"epoch": 0.06687990694969467,
"grad_norm": 11.256792068481445,
"learning_rate": 1.4567010946876445e-05,
"loss": 1.7493,
"step": 2070
},
{
"epoch": 0.0672029982876159,
"grad_norm": 9.741044998168945,
"learning_rate": 1.4498627765346109e-05,
"loss": 1.8623,
"step": 2080
},
{
"epoch": 0.06752608962553713,
"grad_norm": 8.61628532409668,
"learning_rate": 1.4430108181997962e-05,
"loss": 1.7821,
"step": 2090
},
{
"epoch": 0.06784918096345836,
"grad_norm": 6.974494934082031,
"learning_rate": 1.4361455013403695e-05,
"loss": 1.7919,
"step": 2100
},
{
"epoch": 0.06784918096345836,
"eval_loss": 1.7191635370254517,
"eval_runtime": 48.5314,
"eval_samples_per_second": 10.323,
"eval_steps_per_second": 10.323,
"step": 2100
},
{
"epoch": 0.0681722723013796,
"grad_norm": 10.361462593078613,
"learning_rate": 1.4292671081626183e-05,
"loss": 1.7856,
"step": 2110
},
{
"epoch": 0.06849536363930082,
"grad_norm": 11.415956497192383,
"learning_rate": 1.4223759214103443e-05,
"loss": 1.8235,
"step": 2120
},
{
"epoch": 0.06881845497722205,
"grad_norm": 10.382593154907227,
"learning_rate": 1.4154722243532445e-05,
"loss": 1.7848,
"step": 2130
},
{
"epoch": 0.06914154631514328,
"grad_norm": 8.885568618774414,
"learning_rate": 1.4085563007752654e-05,
"loss": 1.7903,
"step": 2140
},
{
"epoch": 0.06946463765306451,
"grad_norm": 8.006834030151367,
"learning_rate": 1.4016284349629364e-05,
"loss": 1.7024,
"step": 2150
},
{
"epoch": 0.06978772899098575,
"grad_norm": 10.482210159301758,
"learning_rate": 1.3946889116936874e-05,
"loss": 1.7564,
"step": 2160
},
{
"epoch": 0.07011082032890698,
"grad_norm": 7.895102500915527,
"learning_rate": 1.3877380162241394e-05,
"loss": 1.5733,
"step": 2170
},
{
"epoch": 0.0704339116668282,
"grad_norm": 8.805550575256348,
"learning_rate": 1.3807760342783804e-05,
"loss": 1.6728,
"step": 2180
},
{
"epoch": 0.07075700300474944,
"grad_norm": 7.979333400726318,
"learning_rate": 1.37380325203622e-05,
"loss": 1.7731,
"step": 2190
},
{
"epoch": 0.07108009434267067,
"grad_norm": 9.50521469116211,
"learning_rate": 1.3668199561214252e-05,
"loss": 1.7482,
"step": 2200
},
{
"epoch": 0.07140318568059191,
"grad_norm": 7.414913177490234,
"learning_rate": 1.35982643358994e-05,
"loss": 1.7075,
"step": 2210
},
{
"epoch": 0.07172627701851314,
"grad_norm": 8.384417533874512,
"learning_rate": 1.3528229719180835e-05,
"loss": 1.8332,
"step": 2220
},
{
"epoch": 0.07204936835643437,
"grad_norm": 9.352381706237793,
"learning_rate": 1.3458098589907348e-05,
"loss": 1.806,
"step": 2230
},
{
"epoch": 0.0723724596943556,
"grad_norm": 12.371936798095703,
"learning_rate": 1.3387873830894973e-05,
"loss": 1.8181,
"step": 2240
},
{
"epoch": 0.07269555103227683,
"grad_norm": 8.478230476379395,
"learning_rate": 1.3317558328808506e-05,
"loss": 1.7851,
"step": 2250
},
{
"epoch": 0.07269555103227683,
"eval_loss": 1.712678074836731,
"eval_runtime": 50.8101,
"eval_samples_per_second": 9.86,
"eval_steps_per_second": 9.86,
"step": 2250
},
{
"epoch": 0.07301864237019806,
"grad_norm": 7.574065685272217,
"learning_rate": 1.3247154974042827e-05,
"loss": 1.762,
"step": 2260
},
{
"epoch": 0.07334173370811929,
"grad_norm": 11.433499336242676,
"learning_rate": 1.3176666660604102e-05,
"loss": 1.834,
"step": 2270
},
{
"epoch": 0.07366482504604052,
"grad_norm": 7.498250484466553,
"learning_rate": 1.3106096285990812e-05,
"loss": 1.8071,
"step": 2280
},
{
"epoch": 0.07398791638396175,
"grad_norm": 7.674075126647949,
"learning_rate": 1.3035446751074653e-05,
"loss": 1.7767,
"step": 2290
},
{
"epoch": 0.07431100772188298,
"grad_norm": 10.668132781982422,
"learning_rate": 1.2964720959981287e-05,
"loss": 1.5325,
"step": 2300
},
{
"epoch": 0.07463409905980421,
"grad_norm": 8.854166984558105,
"learning_rate": 1.2893921819970972e-05,
"loss": 1.845,
"step": 2310
},
{
"epoch": 0.07495719039772544,
"grad_norm": 6.754697322845459,
"learning_rate": 1.2823052241319061e-05,
"loss": 1.739,
"step": 2320
},
{
"epoch": 0.07528028173564667,
"grad_norm": 7.947122573852539,
"learning_rate": 1.2752115137196341e-05,
"loss": 1.6849,
"step": 2330
},
{
"epoch": 0.0756033730735679,
"grad_norm": 12.404890060424805,
"learning_rate": 1.2681113423549334e-05,
"loss": 1.784,
"step": 2340
},
{
"epoch": 0.07592646441148913,
"grad_norm": 8.453229904174805,
"learning_rate": 1.2610050018980385e-05,
"loss": 1.6999,
"step": 2350
},
{
"epoch": 0.07624955574941036,
"grad_norm": 7.385756015777588,
"learning_rate": 1.2538927844627726e-05,
"loss": 1.6897,
"step": 2360
},
{
"epoch": 0.0765726470873316,
"grad_norm": 9.446147918701172,
"learning_rate": 1.2467749824045373e-05,
"loss": 1.8576,
"step": 2370
},
{
"epoch": 0.07689573842525282,
"grad_norm": 7.881250858306885,
"learning_rate": 1.2396518883082966e-05,
"loss": 1.6776,
"step": 2380
},
{
"epoch": 0.07721882976317405,
"grad_norm": 11.96408462524414,
"learning_rate": 1.2325237949765496e-05,
"loss": 1.791,
"step": 2390
},
{
"epoch": 0.07754192110109528,
"grad_norm": 10.829449653625488,
"learning_rate": 1.225390995417295e-05,
"loss": 1.7245,
"step": 2400
},
{
"epoch": 0.07754192110109528,
"eval_loss": 1.7104594707489014,
"eval_runtime": 50.4698,
"eval_samples_per_second": 9.927,
"eval_steps_per_second": 9.927,
"step": 2400
},
{
"epoch": 0.07786501243901651,
"grad_norm": 10.396730422973633,
"learning_rate": 1.2182537828319848e-05,
"loss": 1.8555,
"step": 2410
},
{
"epoch": 0.07818810377693775,
"grad_norm": 9.35103702545166,
"learning_rate": 1.2111124506034739e-05,
"loss": 1.8687,
"step": 2420
},
{
"epoch": 0.07851119511485898,
"grad_norm": 8.117349624633789,
"learning_rate": 1.2039672922839598e-05,
"loss": 1.7337,
"step": 2430
},
{
"epoch": 0.0788342864527802,
"grad_norm": 7.560978889465332,
"learning_rate": 1.196818601582915e-05,
"loss": 1.6828,
"step": 2440
},
{
"epoch": 0.07915737779070144,
"grad_norm": 7.487736225128174,
"learning_rate": 1.189666672355015e-05,
"loss": 1.7545,
"step": 2450
},
{
"epoch": 0.07948046912862267,
"grad_norm": 10.839227676391602,
"learning_rate": 1.1825117985880576e-05,
"loss": 1.749,
"step": 2460
},
{
"epoch": 0.0798035604665439,
"grad_norm": 8.697480201721191,
"learning_rate": 1.1753542743908802e-05,
"loss": 1.7122,
"step": 2470
},
{
"epoch": 0.08012665180446513,
"grad_norm": 11.060961723327637,
"learning_rate": 1.1681943939812688e-05,
"loss": 1.6293,
"step": 2480
},
{
"epoch": 0.08044974314238636,
"grad_norm": 8.913187980651855,
"learning_rate": 1.1610324516738626e-05,
"loss": 1.7505,
"step": 2490
},
{
"epoch": 0.08077283448030759,
"grad_norm": 6.877689361572266,
"learning_rate": 1.1538687418680596e-05,
"loss": 1.4006,
"step": 2500
},
{
"epoch": 0.08109592581822882,
"grad_norm": 11.248068809509277,
"learning_rate": 1.1467035590359106e-05,
"loss": 1.7784,
"step": 2510
},
{
"epoch": 0.08141901715615005,
"grad_norm": 10.510488510131836,
"learning_rate": 1.139537197710018e-05,
"loss": 1.7669,
"step": 2520
},
{
"epoch": 0.08174210849407128,
"grad_norm": 14.91984748840332,
"learning_rate": 1.1323699524714278e-05,
"loss": 1.6776,
"step": 2530
},
{
"epoch": 0.08206519983199251,
"grad_norm": 8.973823547363281,
"learning_rate": 1.1252021179375192e-05,
"loss": 1.8215,
"step": 2540
},
{
"epoch": 0.08238829116991374,
"grad_norm": 8.837857246398926,
"learning_rate": 1.118033988749895e-05,
"loss": 1.69,
"step": 2550
},
{
"epoch": 0.08238829116991374,
"eval_loss": 1.7076596021652222,
"eval_runtime": 44.7857,
"eval_samples_per_second": 11.187,
"eval_steps_per_second": 11.187,
"step": 2550
},
{
"epoch": 0.08271138250783497,
"grad_norm": 12.05571174621582,
"learning_rate": 1.1108658595622709e-05,
"loss": 1.7875,
"step": 2560
},
{
"epoch": 0.0830344738457562,
"grad_norm": 17.358158111572266,
"learning_rate": 1.1036980250283621e-05,
"loss": 1.7574,
"step": 2570
},
{
"epoch": 0.08335756518367743,
"grad_norm": 7.721527576446533,
"learning_rate": 1.096530779789772e-05,
"loss": 1.7518,
"step": 2580
},
{
"epoch": 0.08368065652159866,
"grad_norm": 11.574057579040527,
"learning_rate": 1.0893644184638797e-05,
"loss": 1.7236,
"step": 2590
},
{
"epoch": 0.08400374785951989,
"grad_norm": 10.338488578796387,
"learning_rate": 1.0821992356317307e-05,
"loss": 1.8056,
"step": 2600
},
{
"epoch": 0.08432683919744112,
"grad_norm": 9.807657241821289,
"learning_rate": 1.0750355258259273e-05,
"loss": 1.7627,
"step": 2610
},
{
"epoch": 0.08464993053536235,
"grad_norm": 7.947932720184326,
"learning_rate": 1.0678735835185219e-05,
"loss": 1.805,
"step": 2620
},
{
"epoch": 0.08497302187328358,
"grad_norm": 9.967132568359375,
"learning_rate": 1.06071370310891e-05,
"loss": 1.8368,
"step": 2630
},
{
"epoch": 0.08529611321120481,
"grad_norm": 10.970772743225098,
"learning_rate": 1.0535561789117327e-05,
"loss": 1.7216,
"step": 2640
},
{
"epoch": 0.08561920454912604,
"grad_norm": 8.452691078186035,
"learning_rate": 1.0464013051447755e-05,
"loss": 1.6741,
"step": 2650
},
{
"epoch": 0.08594229588704727,
"grad_norm": 10.103203773498535,
"learning_rate": 1.0392493759168751e-05,
"loss": 1.7487,
"step": 2660
},
{
"epoch": 0.0862653872249685,
"grad_norm": 9.015948295593262,
"learning_rate": 1.0321006852158306e-05,
"loss": 1.7332,
"step": 2670
},
{
"epoch": 0.08658847856288973,
"grad_norm": 6.827271938323975,
"learning_rate": 1.0249555268963164e-05,
"loss": 1.8094,
"step": 2680
},
{
"epoch": 0.08691156990081096,
"grad_norm": 10.552309036254883,
"learning_rate": 1.0178141946678054e-05,
"loss": 1.7288,
"step": 2690
},
{
"epoch": 0.08723466123873219,
"grad_norm": 7.483726501464844,
"learning_rate": 1.0106769820824951e-05,
"loss": 1.7407,
"step": 2700
},
{
"epoch": 0.08723466123873219,
"eval_loss": 1.7069541215896606,
"eval_runtime": 50.7115,
"eval_samples_per_second": 9.879,
"eval_steps_per_second": 9.879,
"step": 2700
},
{
"epoch": 0.08755775257665342,
"grad_norm": 7.673255920410156,
"learning_rate": 1.0035441825232406e-05,
"loss": 1.7783,
"step": 2710
},
{
"epoch": 0.08788084391457465,
"grad_norm": 7.7522382736206055,
"learning_rate": 9.964160891914937e-06,
"loss": 1.7612,
"step": 2720
},
{
"epoch": 0.08820393525249588,
"grad_norm": 10.859241485595703,
"learning_rate": 9.892929950952532e-06,
"loss": 1.713,
"step": 2730
},
{
"epoch": 0.08852702659041711,
"grad_norm": 8.127120971679688,
"learning_rate": 9.821751930370177e-06,
"loss": 1.7304,
"step": 2740
},
{
"epoch": 0.08885011792833834,
"grad_norm": 9.909963607788086,
"learning_rate": 9.750629756017514e-06,
"loss": 1.6213,
"step": 2750
},
{
"epoch": 0.08917320926625957,
"grad_norm": 10.604835510253906,
"learning_rate": 9.679566351448571e-06,
"loss": 1.6823,
"step": 2760
},
{
"epoch": 0.0894963006041808,
"grad_norm": 16.078346252441406,
"learning_rate": 9.608564637801562e-06,
"loss": 1.7492,
"step": 2770
},
{
"epoch": 0.08981939194210203,
"grad_norm": 12.718731880187988,
"learning_rate": 9.537627533678842e-06,
"loss": 1.7284,
"step": 2780
},
{
"epoch": 0.09014248328002326,
"grad_norm": 7.692546367645264,
"learning_rate": 9.466757955026925e-06,
"loss": 1.7821,
"step": 2790
},
{
"epoch": 0.0904655746179445,
"grad_norm": 10.689606666564941,
"learning_rate": 9.395958815016618e-06,
"loss": 1.6207,
"step": 2800
},
{
"epoch": 0.09078866595586572,
"grad_norm": 8.80530071258545,
"learning_rate": 9.325233023923252e-06,
"loss": 1.8065,
"step": 2810
},
{
"epoch": 0.09111175729378695,
"grad_norm": 9.406537055969238,
"learning_rate": 9.25458348900709e-06,
"loss": 1.678,
"step": 2820
},
{
"epoch": 0.09143484863170818,
"grad_norm": 10.812992095947266,
"learning_rate": 9.1840131143938e-06,
"loss": 1.6795,
"step": 2830
},
{
"epoch": 0.09175793996962942,
"grad_norm": 7.725828170776367,
"learning_rate": 9.113524800955074e-06,
"loss": 1.7043,
"step": 2840
},
{
"epoch": 0.09208103130755065,
"grad_norm": 7.820013046264648,
"learning_rate": 9.043121446189398e-06,
"loss": 1.683,
"step": 2850
},
{
"epoch": 0.09208103130755065,
"eval_loss": 1.7077971696853638,
"eval_runtime": 49.9011,
"eval_samples_per_second": 10.04,
"eval_steps_per_second": 10.04,
"step": 2850
},
{
"epoch": 0.09240412264547188,
"grad_norm": 7.655306816101074,
"learning_rate": 8.972805944102928e-06,
"loss": 1.7857,
"step": 2860
},
{
"epoch": 0.0927272139833931,
"grad_norm": 10.282003402709961,
"learning_rate": 8.902581185090555e-06,
"loss": 1.6909,
"step": 2870
},
{
"epoch": 0.09305030532131434,
"grad_norm": 7.6710100173950195,
"learning_rate": 8.832450055817064e-06,
"loss": 1.7883,
"step": 2880
},
{
"epoch": 0.09337339665923557,
"grad_norm": 7.631242275238037,
"learning_rate": 8.7624154390985e-06,
"loss": 1.6346,
"step": 2890
},
{
"epoch": 0.0936964879971568,
"grad_norm": 7.102456092834473,
"learning_rate": 8.692480213783649e-06,
"loss": 1.8356,
"step": 2900
},
{
"epoch": 0.09401957933507803,
"grad_norm": 7.865874767303467,
"learning_rate": 8.622647254635703e-06,
"loss": 1.821,
"step": 2910
},
{
"epoch": 0.09434267067299926,
"grad_norm": 8.540060997009277,
"learning_rate": 8.552919432214097e-06,
"loss": 1.6552,
"step": 2920
},
{
"epoch": 0.09466576201092049,
"grad_norm": 8.773176193237305,
"learning_rate": 8.483299612756505e-06,
"loss": 1.9238,
"step": 2930
},
{
"epoch": 0.09498885334884172,
"grad_norm": 8.697489738464355,
"learning_rate": 8.413790658061028e-06,
"loss": 1.7673,
"step": 2940
},
{
"epoch": 0.09531194468676295,
"grad_norm": 8.043781280517578,
"learning_rate": 8.344395425368537e-06,
"loss": 1.8077,
"step": 2950
},
{
"epoch": 0.09563503602468418,
"grad_norm": 12.204351425170898,
"learning_rate": 8.275116767245251e-06,
"loss": 1.6457,
"step": 2960
},
{
"epoch": 0.09595812736260541,
"grad_norm": 7.778212070465088,
"learning_rate": 8.205957531465456e-06,
"loss": 1.5633,
"step": 2970
},
{
"epoch": 0.09628121870052664,
"grad_norm": 8.191123008728027,
"learning_rate": 8.136920560894458e-06,
"loss": 1.8152,
"step": 2980
},
{
"epoch": 0.09660431003844787,
"grad_norm": 12.325820922851562,
"learning_rate": 8.068008693371723e-06,
"loss": 1.694,
"step": 2990
},
{
"epoch": 0.0969274013763691,
"grad_norm": 10.698838233947754,
"learning_rate": 7.999224761594206e-06,
"loss": 1.9075,
"step": 3000
},
{
"epoch": 0.0969274013763691,
"eval_loss": 1.7048513889312744,
"eval_runtime": 49.0658,
"eval_samples_per_second": 10.211,
"eval_steps_per_second": 10.211,
"step": 3000
},
{
"epoch": 0.09725049271429033,
"grad_norm": 7.618257999420166,
"learning_rate": 7.930571592999942e-06,
"loss": 1.6903,
"step": 3010
},
{
"epoch": 0.09757358405221156,
"grad_norm": 7.423304080963135,
"learning_rate": 7.86205200965179e-06,
"loss": 1.8027,
"step": 3020
},
{
"epoch": 0.09789667539013279,
"grad_norm": 10.209698677062988,
"learning_rate": 7.793668828121457e-06,
"loss": 1.6316,
"step": 3030
},
{
"epoch": 0.09821976672805402,
"grad_norm": 8.266161918640137,
"learning_rate": 7.725424859373688e-06,
"loss": 1.751,
"step": 3040
},
{
"epoch": 0.09854285806597525,
"grad_norm": 13.602888107299805,
"learning_rate": 7.65732290865075e-06,
"loss": 1.9036,
"step": 3050
},
{
"epoch": 0.09886594940389648,
"grad_norm": 9.627706527709961,
"learning_rate": 7.589365775357096e-06,
"loss": 1.5739,
"step": 3060
},
{
"epoch": 0.09918904074181771,
"grad_norm": 8.23000431060791,
"learning_rate": 7.52155625294431e-06,
"loss": 1.8857,
"step": 3070
},
{
"epoch": 0.09951213207973894,
"grad_norm": 9.111807823181152,
"learning_rate": 7.453897128796269e-06,
"loss": 1.6359,
"step": 3080
},
{
"epoch": 0.09983522341766017,
"grad_norm": 7.767091274261475,
"learning_rate": 7.386391184114558e-06,
"loss": 1.7635,
"step": 3090
},
{
"epoch": 0.1001583147555814,
"grad_norm": 7.9762797355651855,
"learning_rate": 7.319041193804161e-06,
"loss": 1.5899,
"step": 3100
},
{
"epoch": 0.10048140609350263,
"grad_norm": 15.16207218170166,
"learning_rate": 7.2518499263593866e-06,
"loss": 1.6362,
"step": 3110
},
{
"epoch": 0.10080449743142386,
"grad_norm": 9.359869003295898,
"learning_rate": 7.184820143750079e-06,
"loss": 1.687,
"step": 3120
},
{
"epoch": 0.10112758876934509,
"grad_norm": 8.03466796875,
"learning_rate": 7.117954601308052e-06,
"loss": 1.6855,
"step": 3130
},
{
"epoch": 0.10145068010726632,
"grad_norm": 8.714197158813477,
"learning_rate": 7.051256047613866e-06,
"loss": 1.6671,
"step": 3140
},
{
"epoch": 0.10177377144518755,
"grad_norm": 7.063264846801758,
"learning_rate": 6.984727224383822e-06,
"loss": 1.7538,
"step": 3150
},
{
"epoch": 0.10177377144518755,
"eval_loss": 1.702789545059204,
"eval_runtime": 48.5181,
"eval_samples_per_second": 10.326,
"eval_steps_per_second": 10.326,
"step": 3150
},
{
"epoch": 0.10209686278310878,
"grad_norm": 8.911219596862793,
"learning_rate": 6.918370866357266e-06,
"loss": 1.7791,
"step": 3160
},
{
"epoch": 0.10241995412103001,
"grad_norm": 9.150374412536621,
"learning_rate": 6.852189701184183e-06,
"loss": 1.6636,
"step": 3170
},
{
"epoch": 0.10274304545895124,
"grad_norm": 5.501253128051758,
"learning_rate": 6.786186449313051e-06,
"loss": 1.7729,
"step": 3180
},
{
"epoch": 0.10306613679687247,
"grad_norm": 8.506068229675293,
"learning_rate": 6.720363823879042e-06,
"loss": 1.6332,
"step": 3190
},
{
"epoch": 0.1033892281347937,
"grad_norm": 7.7138895988464355,
"learning_rate": 6.6547245305924765e-06,
"loss": 1.6499,
"step": 3200
},
{
"epoch": 0.10371231947271493,
"grad_norm": 8.832993507385254,
"learning_rate": 6.589271267627615e-06,
"loss": 1.7456,
"step": 3210
},
{
"epoch": 0.10403541081063616,
"grad_norm": 7.764387130737305,
"learning_rate": 6.524006725511727e-06,
"loss": 1.7079,
"step": 3220
},
{
"epoch": 0.1043585021485574,
"grad_norm": 9.634687423706055,
"learning_rate": 6.4589335870145165e-06,
"loss": 1.7564,
"step": 3230
},
{
"epoch": 0.10468159348647862,
"grad_norm": 12.632416725158691,
"learning_rate": 6.394054527037837e-06,
"loss": 1.659,
"step": 3240
},
{
"epoch": 0.10500468482439985,
"grad_norm": 6.407688140869141,
"learning_rate": 6.329372212505727e-06,
"loss": 1.6707,
"step": 3250
},
{
"epoch": 0.10532777616232109,
"grad_norm": 14.8198823928833,
"learning_rate": 6.264889302254797e-06,
"loss": 1.7245,
"step": 3260
},
{
"epoch": 0.10565086750024232,
"grad_norm": 9.50849437713623,
"learning_rate": 6.200608446924922e-06,
"loss": 1.9256,
"step": 3270
},
{
"epoch": 0.10597395883816355,
"grad_norm": 10.970282554626465,
"learning_rate": 6.136532288850295e-06,
"loss": 1.5962,
"step": 3280
},
{
"epoch": 0.10629705017608478,
"grad_norm": 12.26534652709961,
"learning_rate": 6.072663461950806e-06,
"loss": 1.8693,
"step": 3290
},
{
"epoch": 0.106620141514006,
"grad_norm": 13.819258689880371,
"learning_rate": 6.009004591623776e-06,
"loss": 1.728,
"step": 3300
},
{
"epoch": 0.106620141514006,
"eval_loss": 1.6982126235961914,
"eval_runtime": 52.4496,
"eval_samples_per_second": 9.552,
"eval_steps_per_second": 9.552,
"step": 3300
},
{
"epoch": 0.10694323285192724,
"grad_norm": 13.012596130371094,
"learning_rate": 5.945558294636019e-06,
"loss": 1.6098,
"step": 3310
},
{
"epoch": 0.10726632418984847,
"grad_norm": 9.684890747070312,
"learning_rate": 5.882327179016307e-06,
"loss": 1.5688,
"step": 3320
},
{
"epoch": 0.1075894155277697,
"grad_norm": 8.362608909606934,
"learning_rate": 5.819313843948146e-06,
"loss": 1.531,
"step": 3330
},
{
"epoch": 0.10791250686569093,
"grad_norm": 6.872709274291992,
"learning_rate": 5.756520879662929e-06,
"loss": 1.8154,
"step": 3340
},
{
"epoch": 0.10823559820361216,
"grad_norm": 7.6126275062561035,
"learning_rate": 5.693950867333488e-06,
"loss": 1.6701,
"step": 3350
},
{
"epoch": 0.10855868954153339,
"grad_norm": 11.798931121826172,
"learning_rate": 5.6316063789679415e-06,
"loss": 1.6676,
"step": 3360
},
{
"epoch": 0.10888178087945462,
"grad_norm": 8.72461986541748,
"learning_rate": 5.569489977304029e-06,
"loss": 1.6311,
"step": 3370
},
{
"epoch": 0.10920487221737585,
"grad_norm": 10.633599281311035,
"learning_rate": 5.507604215703729e-06,
"loss": 1.8677,
"step": 3380
},
{
"epoch": 0.10952796355529708,
"grad_norm": 10.684865951538086,
"learning_rate": 5.44595163804831e-06,
"loss": 1.5938,
"step": 3390
},
{
"epoch": 0.10985105489321831,
"grad_norm": 8.996574401855469,
"learning_rate": 5.384534778633763e-06,
"loss": 1.7584,
"step": 3400
},
{
"epoch": 0.11017414623113954,
"grad_norm": 9.916984558105469,
"learning_rate": 5.323356162066626e-06,
"loss": 1.6673,
"step": 3410
},
{
"epoch": 0.11049723756906077,
"grad_norm": 8.516958236694336,
"learning_rate": 5.262418303160206e-06,
"loss": 1.7681,
"step": 3420
},
{
"epoch": 0.110820328906982,
"grad_norm": 9.242239952087402,
"learning_rate": 5.201723706831204e-06,
"loss": 1.8336,
"step": 3430
},
{
"epoch": 0.11114342024490323,
"grad_norm": 8.185578346252441,
"learning_rate": 5.141274867996755e-06,
"loss": 1.7451,
"step": 3440
},
{
"epoch": 0.11146651158282446,
"grad_norm": 7.650656700134277,
"learning_rate": 5.081074271471855e-06,
"loss": 1.6938,
"step": 3450
},
{
"epoch": 0.11146651158282446,
"eval_loss": 1.698238730430603,
"eval_runtime": 48.6823,
"eval_samples_per_second": 10.291,
"eval_steps_per_second": 10.291,
"step": 3450
},
{
"epoch": 0.11178960292074569,
"grad_norm": 7.296802997589111,
"learning_rate": 5.021124391867241e-06,
"loss": 1.8332,
"step": 3460
},
{
"epoch": 0.11211269425866692,
"grad_norm": 13.779586791992188,
"learning_rate": 4.961427693487654e-06,
"loss": 1.7706,
"step": 3470
},
{
"epoch": 0.11243578559658815,
"grad_norm": 9.145750045776367,
"learning_rate": 4.901986630230549e-06,
"loss": 1.6575,
"step": 3480
},
{
"epoch": 0.11275887693450938,
"grad_norm": 8.650459289550781,
"learning_rate": 4.842803645485228e-06,
"loss": 1.7603,
"step": 3490
},
{
"epoch": 0.11308196827243061,
"grad_norm": 11.718855857849121,
"learning_rate": 4.7838811720323795e-06,
"loss": 1.7843,
"step": 3500
},
{
"epoch": 0.11340505961035184,
"grad_norm": 11.075895309448242,
"learning_rate": 4.725221631944109e-06,
"loss": 1.794,
"step": 3510
},
{
"epoch": 0.11372815094827307,
"grad_norm": 9.832283020019531,
"learning_rate": 4.666827436484355e-06,
"loss": 1.7766,
"step": 3520
},
{
"epoch": 0.1140512422861943,
"grad_norm": 8.829045295715332,
"learning_rate": 4.60870098600978e-06,
"loss": 1.6138,
"step": 3530
},
{
"epoch": 0.11437433362411553,
"grad_norm": 12.860816955566406,
"learning_rate": 4.550844669871095e-06,
"loss": 1.6731,
"step": 3540
},
{
"epoch": 0.11469742496203676,
"grad_norm": 10.444979667663574,
"learning_rate": 4.493260866314851e-06,
"loss": 1.7394,
"step": 3550
},
{
"epoch": 0.11502051629995799,
"grad_norm": 12.831863403320312,
"learning_rate": 4.435951942385671e-06,
"loss": 1.8,
"step": 3560
},
{
"epoch": 0.11534360763787922,
"grad_norm": 11.071577072143555,
"learning_rate": 4.378920253828953e-06,
"loss": 1.7539,
"step": 3570
},
{
"epoch": 0.11566669897580045,
"grad_norm": 6.664394378662109,
"learning_rate": 4.322168144994041e-06,
"loss": 1.741,
"step": 3580
},
{
"epoch": 0.11598979031372168,
"grad_norm": 9.661178588867188,
"learning_rate": 4.265697948737836e-06,
"loss": 1.6935,
"step": 3590
},
{
"epoch": 0.11631288165164291,
"grad_norm": 7.41594934463501,
"learning_rate": 4.209511986328935e-06,
"loss": 1.6957,
"step": 3600
},
{
"epoch": 0.11631288165164291,
"eval_loss": 1.6976414918899536,
"eval_runtime": 49.1478,
"eval_samples_per_second": 10.194,
"eval_steps_per_second": 10.194,
"step": 3600
},
{
"epoch": 0.11663597298956414,
"grad_norm": 8.112351417541504,
"learning_rate": 4.153612567352186e-06,
"loss": 1.6652,
"step": 3610
},
{
"epoch": 0.11695906432748537,
"grad_norm": 8.311092376708984,
"learning_rate": 4.098001989613763e-06,
"loss": 1.7136,
"step": 3620
},
{
"epoch": 0.1172821556654066,
"grad_norm": 10.395671844482422,
"learning_rate": 4.042682539046698e-06,
"loss": 1.7171,
"step": 3630
},
{
"epoch": 0.11760524700332783,
"grad_norm": 10.424238204956055,
"learning_rate": 3.987656489616937e-06,
"loss": 1.7596,
"step": 3640
},
{
"epoch": 0.11792833834124906,
"grad_norm": 7.314935684204102,
"learning_rate": 3.932926103229849e-06,
"loss": 1.6147,
"step": 3650
},
{
"epoch": 0.1182514296791703,
"grad_norm": 10.446200370788574,
"learning_rate": 3.878493629637249e-06,
"loss": 1.6668,
"step": 3660
},
{
"epoch": 0.11857452101709154,
"grad_norm": 7.69391393661499,
"learning_rate": 3.824361306344942e-06,
"loss": 1.8314,
"step": 3670
},
{
"epoch": 0.11889761235501277,
"grad_norm": 9.985037803649902,
"learning_rate": 3.7705313585207056e-06,
"loss": 1.747,
"step": 3680
},
{
"epoch": 0.119220703692934,
"grad_norm": 8.62292194366455,
"learning_rate": 3.717005998902859e-06,
"loss": 1.8816,
"step": 3690
},
{
"epoch": 0.11954379503085523,
"grad_norm": 8.038886070251465,
"learning_rate": 3.6637874277092946e-06,
"loss": 1.7347,
"step": 3700
},
{
"epoch": 0.11986688636877646,
"grad_norm": 9.587797164916992,
"learning_rate": 3.610877832547034e-06,
"loss": 1.7352,
"step": 3710
},
{
"epoch": 0.12018997770669769,
"grad_norm": 7.319656848907471,
"learning_rate": 3.5582793883222923e-06,
"loss": 1.8048,
"step": 3720
},
{
"epoch": 0.12051306904461892,
"grad_norm": 11.947138786315918,
"learning_rate": 3.5059942571511037e-06,
"loss": 1.6342,
"step": 3730
},
{
"epoch": 0.12083616038254015,
"grad_norm": 10.702777862548828,
"learning_rate": 3.4540245882704213e-06,
"loss": 1.6056,
"step": 3740
},
{
"epoch": 0.12115925172046138,
"grad_norm": 8.408270835876465,
"learning_rate": 3.4023725179497848e-06,
"loss": 1.6606,
"step": 3750
},
{
"epoch": 0.12115925172046138,
"eval_loss": 1.6962512731552124,
"eval_runtime": 49.6842,
"eval_samples_per_second": 10.084,
"eval_steps_per_second": 10.084,
"step": 3750
},
{
"epoch": 0.12148234305838261,
"grad_norm": 8.158541679382324,
"learning_rate": 3.351040169403499e-06,
"loss": 1.5587,
"step": 3760
},
{
"epoch": 0.12180543439630384,
"grad_norm": 7.553473949432373,
"learning_rate": 3.30002965270335e-06,
"loss": 1.5088,
"step": 3770
},
{
"epoch": 0.12212852573422507,
"grad_norm": 7.5580267906188965,
"learning_rate": 3.2493430646918865e-06,
"loss": 1.6262,
"step": 3780
},
{
"epoch": 0.1224516170721463,
"grad_norm": 9.70615005493164,
"learning_rate": 3.1989824888962225e-06,
"loss": 1.8517,
"step": 3790
},
{
"epoch": 0.12277470841006753,
"grad_norm": 10.053832054138184,
"learning_rate": 3.1489499954423797e-06,
"loss": 1.8122,
"step": 3800
},
{
"epoch": 0.12309779974798876,
"grad_norm": 7.846384048461914,
"learning_rate": 3.0992476409701936e-06,
"loss": 1.7561,
"step": 3810
},
{
"epoch": 0.12342089108590999,
"grad_norm": 8.455656051635742,
"learning_rate": 3.0498774685487882e-06,
"loss": 1.5963,
"step": 3820
},
{
"epoch": 0.12374398242383122,
"grad_norm": 9.971628189086914,
"learning_rate": 3.000841507592583e-06,
"loss": 1.7193,
"step": 3830
},
{
"epoch": 0.12406707376175245,
"grad_norm": 9.318702697753906,
"learning_rate": 2.9521417737778717e-06,
"loss": 1.6967,
"step": 3840
},
{
"epoch": 0.12439016509967368,
"grad_norm": 7.269077301025391,
"learning_rate": 2.9037802689599704e-06,
"loss": 1.7184,
"step": 3850
},
{
"epoch": 0.12471325643759491,
"grad_norm": 9.587904930114746,
"learning_rate": 2.855758981090918e-06,
"loss": 1.6776,
"step": 3860
},
{
"epoch": 0.12503634777551614,
"grad_norm": 8.016568183898926,
"learning_rate": 2.8080798841377743e-06,
"loss": 1.6396,
"step": 3870
},
{
"epoch": 0.12535943911343736,
"grad_norm": 9.123952865600586,
"learning_rate": 2.7607449380014703e-06,
"loss": 1.7405,
"step": 3880
},
{
"epoch": 0.1256825304513586,
"grad_norm": 9.068984031677246,
"learning_rate": 2.713756088436244e-06,
"loss": 1.6592,
"step": 3890
},
{
"epoch": 0.12600562178927982,
"grad_norm": 8.082880020141602,
"learning_rate": 2.6671152669696515e-06,
"loss": 1.6929,
"step": 3900
},
{
"epoch": 0.12600562178927982,
"eval_loss": 1.6942435503005981,
"eval_runtime": 45.0696,
"eval_samples_per_second": 11.116,
"eval_steps_per_second": 11.116,
"step": 3900
},
{
"epoch": 0.12632871312720106,
"grad_norm": 9.814338684082031,
"learning_rate": 2.6208243908231916e-06,
"loss": 1.6763,
"step": 3910
},
{
"epoch": 0.12665180446512228,
"grad_norm": 7.890253067016602,
"learning_rate": 2.57488536283347e-06,
"loss": 1.7072,
"step": 3920
},
{
"epoch": 0.12697489580304352,
"grad_norm": 7.890439510345459,
"learning_rate": 2.5293000713739977e-06,
"loss": 1.8804,
"step": 3930
},
{
"epoch": 0.12729798714096474,
"grad_norm": 7.131802082061768,
"learning_rate": 2.4840703902775642e-06,
"loss": 1.7909,
"step": 3940
},
{
"epoch": 0.12762107847888599,
"grad_norm": 7.198910713195801,
"learning_rate": 2.4391981787592005e-06,
"loss": 1.7132,
"step": 3950
},
{
"epoch": 0.1279441698168072,
"grad_norm": 10.655743598937988,
"learning_rate": 2.3946852813397737e-06,
"loss": 1.6567,
"step": 3960
},
{
"epoch": 0.12826726115472845,
"grad_norm": 16.728164672851562,
"learning_rate": 2.3505335277701494e-06,
"loss": 1.7548,
"step": 3970
},
{
"epoch": 0.12859035249264966,
"grad_norm": 10.79677963256836,
"learning_rate": 2.306744732955991e-06,
"loss": 1.7443,
"step": 3980
},
{
"epoch": 0.1289134438305709,
"grad_norm": 11.85908317565918,
"learning_rate": 2.2633206968831374e-06,
"loss": 1.6827,
"step": 3990
},
{
"epoch": 0.12923653516849212,
"grad_norm": 7.376532554626465,
"learning_rate": 2.220263204543635e-06,
"loss": 1.6962,
"step": 4000
},
{
"epoch": 0.12955962650641337,
"grad_norm": 9.859908103942871,
"learning_rate": 2.1775740258623492e-06,
"loss": 1.7198,
"step": 4010
},
{
"epoch": 0.12988271784433458,
"grad_norm": 8.99048900604248,
"learning_rate": 2.1352549156242126e-06,
"loss": 1.8287,
"step": 4020
},
{
"epoch": 0.13020580918225583,
"grad_norm": 10.75970458984375,
"learning_rate": 2.0933076134020958e-06,
"loss": 1.7958,
"step": 4030
},
{
"epoch": 0.13052890052017704,
"grad_norm": 11.328814506530762,
"learning_rate": 2.0517338434852946e-06,
"loss": 1.6765,
"step": 4040
},
{
"epoch": 0.1308519918580983,
"grad_norm": 9.293211936950684,
"learning_rate": 2.010535314808659e-06,
"loss": 1.7851,
"step": 4050
},
{
"epoch": 0.1308519918580983,
"eval_loss": 1.6941063404083252,
"eval_runtime": 50.0573,
"eval_samples_per_second": 10.009,
"eval_steps_per_second": 10.009,
"step": 4050
},
{
"epoch": 0.1311750831960195,
"grad_norm": 9.791179656982422,
"learning_rate": 1.9697137208823396e-06,
"loss": 1.8117,
"step": 4060
},
{
"epoch": 0.13149817453394075,
"grad_norm": 8.991856575012207,
"learning_rate": 1.9292707397221775e-06,
"loss": 1.6096,
"step": 4070
},
{
"epoch": 0.13182126587186196,
"grad_norm": 9.120182991027832,
"learning_rate": 1.8892080337807171e-06,
"loss": 1.7314,
"step": 4080
},
{
"epoch": 0.1321443572097832,
"grad_norm": 10.0064697265625,
"learning_rate": 1.8495272498788887e-06,
"loss": 1.6805,
"step": 4090
},
{
"epoch": 0.13246744854770443,
"grad_norm": 6.52011251449585,
"learning_rate": 1.8102300191383008e-06,
"loss": 1.6526,
"step": 4100
},
{
"epoch": 0.13279053988562567,
"grad_norm": 7.259543418884277,
"learning_rate": 1.7713179569141897e-06,
"loss": 1.8271,
"step": 4110
},
{
"epoch": 0.13311363122354689,
"grad_norm": 10.133992195129395,
"learning_rate": 1.7327926627290298e-06,
"loss": 1.7711,
"step": 4120
},
{
"epoch": 0.13343672256146813,
"grad_norm": 7.308538913726807,
"learning_rate": 1.6946557202067662e-06,
"loss": 1.6875,
"step": 4130
},
{
"epoch": 0.13375981389938935,
"grad_norm": 10.048575401306152,
"learning_rate": 1.6569086970077352e-06,
"loss": 1.705,
"step": 4140
},
{
"epoch": 0.1340829052373106,
"grad_norm": 10.202974319458008,
"learning_rate": 1.6195531447642177e-06,
"loss": 1.755,
"step": 4150
},
{
"epoch": 0.1344059965752318,
"grad_norm": 9.808621406555176,
"learning_rate": 1.582590599016653e-06,
"loss": 1.7341,
"step": 4160
},
{
"epoch": 0.13472908791315305,
"grad_norm": 13.069711685180664,
"learning_rate": 1.5460225791505258e-06,
"loss": 1.621,
"step": 4170
},
{
"epoch": 0.13505217925107427,
"grad_norm": 9.029712677001953,
"learning_rate": 1.509850588333905e-06,
"loss": 1.7657,
"step": 4180
},
{
"epoch": 0.1353752705889955,
"grad_norm": 9.384182929992676,
"learning_rate": 1.4740761134556557e-06,
"loss": 1.7284,
"step": 4190
},
{
"epoch": 0.13569836192691673,
"grad_norm": 12.77877426147461,
"learning_rate": 1.4387006250643236e-06,
"loss": 1.6129,
"step": 4200
},
{
"epoch": 0.13569836192691673,
"eval_loss": 1.6926029920578003,
"eval_runtime": 45.4397,
"eval_samples_per_second": 11.026,
"eval_steps_per_second": 11.026,
"step": 4200
},
{
"epoch": 0.13602145326483797,
"grad_norm": 7.0634846687316895,
"learning_rate": 1.4037255773076804e-06,
"loss": 1.7018,
"step": 4210
},
{
"epoch": 0.1363445446027592,
"grad_norm": 9.692054748535156,
"learning_rate": 1.3691524078729481e-06,
"loss": 1.7729,
"step": 4220
},
{
"epoch": 0.13666763594068043,
"grad_norm": 7.554727554321289,
"learning_rate": 1.3349825379277099e-06,
"loss": 1.5513,
"step": 4230
},
{
"epoch": 0.13699072727860165,
"grad_norm": 10.388784408569336,
"learning_rate": 1.3012173720614862e-06,
"loss": 1.7794,
"step": 4240
},
{
"epoch": 0.1373138186165229,
"grad_norm": 8.613759994506836,
"learning_rate": 1.267858298227995e-06,
"loss": 1.8116,
"step": 4250
},
{
"epoch": 0.1376369099544441,
"grad_norm": 9.194704055786133,
"learning_rate": 1.2349066876881063e-06,
"loss": 1.7516,
"step": 4260
},
{
"epoch": 0.13796000129236535,
"grad_norm": 7.637813091278076,
"learning_rate": 1.202363894953462e-06,
"loss": 1.8919,
"step": 4270
},
{
"epoch": 0.13828309263028657,
"grad_norm": 7.157520771026611,
"learning_rate": 1.1702312577308133e-06,
"loss": 1.8295,
"step": 4280
},
{
"epoch": 0.1386061839682078,
"grad_norm": 7.413049221038818,
"learning_rate": 1.1385100968670189e-06,
"loss": 1.6244,
"step": 4290
},
{
"epoch": 0.13892927530612903,
"grad_norm": 13.795228958129883,
"learning_rate": 1.107201716294762e-06,
"loss": 1.6479,
"step": 4300
},
{
"epoch": 0.13925236664405027,
"grad_norm": 12.399872779846191,
"learning_rate": 1.076307402978938e-06,
"loss": 1.6793,
"step": 4310
},
{
"epoch": 0.1395754579819715,
"grad_norm": 8.875575065612793,
"learning_rate": 1.0458284268637652e-06,
"loss": 1.7353,
"step": 4320
},
{
"epoch": 0.13989854931989273,
"grad_norm": 7.911688804626465,
"learning_rate": 1.0157660408205728e-06,
"loss": 1.6169,
"step": 4330
},
{
"epoch": 0.14022164065781395,
"grad_norm": 9.808359146118164,
"learning_rate": 9.861214805963042e-07,
"loss": 1.7996,
"step": 4340
},
{
"epoch": 0.1405447319957352,
"grad_norm": 10.307291030883789,
"learning_rate": 9.568959647627223e-07,
"loss": 1.6639,
"step": 4350
},
{
"epoch": 0.1405447319957352,
"eval_loss": 1.6928385496139526,
"eval_runtime": 49.1761,
"eval_samples_per_second": 10.188,
"eval_steps_per_second": 10.188,
"step": 4350
},
{
"epoch": 0.1408678233336564,
"grad_norm": 12.124625205993652,
"learning_rate": 9.280906946663111e-07,
"loss": 1.797,
"step": 4360
},
{
"epoch": 0.14119091467157766,
"grad_norm": 10.66218376159668,
"learning_rate": 8.997068543789051e-07,
"loss": 1.6894,
"step": 4370
},
{
"epoch": 0.14151400600949887,
"grad_norm": 11.1690673828125,
"learning_rate": 8.717456106490042e-07,
"loss": 1.7584,
"step": 4380
},
{
"epoch": 0.14183709734742012,
"grad_norm": 9.9891357421875,
"learning_rate": 8.442081128538243e-07,
"loss": 1.6333,
"step": 4390
},
{
"epoch": 0.14216018868534133,
"grad_norm": 8.535771369934082,
"learning_rate": 8.170954929520389e-07,
"loss": 1.6837,
"step": 4400
},
{
"epoch": 0.14248328002326258,
"grad_norm": 9.602922439575195,
"learning_rate": 7.904088654372622e-07,
"loss": 1.6985,
"step": 4410
},
{
"epoch": 0.14280637136118382,
"grad_norm": 6.4636688232421875,
"learning_rate": 7.641493272922243e-07,
"loss": 1.7214,
"step": 4420
},
{
"epoch": 0.14312946269910504,
"grad_norm": 10.1285982131958,
"learning_rate": 7.383179579436903e-07,
"loss": 1.7285,
"step": 4430
},
{
"epoch": 0.14345255403702628,
"grad_norm": 10.605436325073242,
"learning_rate": 7.129158192180766e-07,
"loss": 1.6752,
"step": 4440
},
{
"epoch": 0.1437756453749475,
"grad_norm": 8.596016883850098,
"learning_rate": 6.879439552978142e-07,
"loss": 1.7964,
"step": 4450
},
{
"epoch": 0.14409873671286874,
"grad_norm": 10.723006248474121,
"learning_rate": 6.634033926784221e-07,
"loss": 1.7373,
"step": 4460
},
{
"epoch": 0.14442182805078996,
"grad_norm": 7.131843090057373,
"learning_rate": 6.392951401263069e-07,
"loss": 1.7181,
"step": 4470
},
{
"epoch": 0.1447449193887112,
"grad_norm": 8.122928619384766,
"learning_rate": 6.156201886373113e-07,
"loss": 1.7157,
"step": 4480
},
{
"epoch": 0.14506801072663242,
"grad_norm": 7.9331583976745605,
"learning_rate": 5.923795113959569e-07,
"loss": 1.6884,
"step": 4490
},
{
"epoch": 0.14539110206455366,
"grad_norm": 10.320080757141113,
"learning_rate": 5.695740637354591e-07,
"loss": 1.7235,
"step": 4500
},
{
"epoch": 0.14539110206455366,
"eval_loss": 1.6921895742416382,
"eval_runtime": 49.8365,
"eval_samples_per_second": 10.053,
"eval_steps_per_second": 10.053,
"step": 4500
},
{
"epoch": 0.14571419340247488,
"grad_norm": 7.294159889221191,
"learning_rate": 5.472047830984499e-07,
"loss": 1.7577,
"step": 4510
},
{
"epoch": 0.14603728474039612,
"grad_norm": 9.507523536682129,
"learning_rate": 5.252725889984403e-07,
"loss": 1.7748,
"step": 4520
},
{
"epoch": 0.14636037607831734,
"grad_norm": 10.296547889709473,
"learning_rate": 5.037783829820298e-07,
"loss": 1.6676,
"step": 4530
},
{
"epoch": 0.14668346741623858,
"grad_norm": 10.683934211730957,
"learning_rate": 4.827230485918372e-07,
"loss": 1.703,
"step": 4540
},
{
"epoch": 0.1470065587541598,
"grad_norm": 13.149202346801758,
"learning_rate": 4.6210745133019236e-07,
"loss": 1.8596,
"step": 4550
},
{
"epoch": 0.14732965009208104,
"grad_norm": 10.627421379089355,
"learning_rate": 4.419324386235529e-07,
"loss": 1.5863,
"step": 4560
},
{
"epoch": 0.14765274143000226,
"grad_norm": 8.185441970825195,
"learning_rate": 4.2219883978767386e-07,
"loss": 1.7421,
"step": 4570
},
{
"epoch": 0.1479758327679235,
"grad_norm": 6.5582804679870605,
"learning_rate": 4.029074659935082e-07,
"loss": 1.7486,
"step": 4580
},
{
"epoch": 0.14829892410584472,
"grad_norm": 7.293984413146973,
"learning_rate": 3.8405911023387444e-07,
"loss": 1.7631,
"step": 4590
},
{
"epoch": 0.14862201544376596,
"grad_norm": 10.495855331420898,
"learning_rate": 3.6565454729085526e-07,
"loss": 1.8289,
"step": 4600
},
{
"epoch": 0.14894510678168718,
"grad_norm": 7.07685661315918,
"learning_rate": 3.4769453370394753e-07,
"loss": 1.6386,
"step": 4610
},
{
"epoch": 0.14926819811960843,
"grad_norm": 8.069764137268066,
"learning_rate": 3.301798077389637e-07,
"loss": 1.585,
"step": 4620
},
{
"epoch": 0.14959128945752964,
"grad_norm": 8.399779319763184,
"learning_rate": 3.1311108935768926e-07,
"loss": 1.5544,
"step": 4630
},
{
"epoch": 0.14991438079545089,
"grad_norm": 7.10072660446167,
"learning_rate": 2.964890801882817e-07,
"loss": 1.7765,
"step": 4640
},
{
"epoch": 0.1502374721333721,
"grad_norm": 12.693696022033691,
"learning_rate": 2.8031446349643393e-07,
"loss": 1.5691,
"step": 4650
},
{
"epoch": 0.1502374721333721,
"eval_loss": 1.6924811601638794,
"eval_runtime": 50.4888,
"eval_samples_per_second": 9.923,
"eval_steps_per_second": 9.923,
"step": 4650
},
{
"epoch": 0.15056056347129335,
"grad_norm": 8.841912269592285,
"learning_rate": 2.645879041572891e-07,
"loss": 1.6589,
"step": 4660
},
{
"epoch": 0.15088365480921456,
"grad_norm": 7.690126895904541,
"learning_rate": 2.4931004862810295e-07,
"loss": 1.7137,
"step": 4670
},
{
"epoch": 0.1512067461471358,
"grad_norm": 14.600467681884766,
"learning_rate": 2.3448152492167586e-07,
"loss": 1.8001,
"step": 4680
},
{
"epoch": 0.15152983748505702,
"grad_norm": 8.619688034057617,
"learning_rate": 2.201029425805393e-07,
"loss": 1.7615,
"step": 4690
},
{
"epoch": 0.15185292882297827,
"grad_norm": 12.033727645874023,
"learning_rate": 2.061748926518972e-07,
"loss": 1.6317,
"step": 4700
},
{
"epoch": 0.15217602016089948,
"grad_norm": 9.276659965515137,
"learning_rate": 1.9269794766333073e-07,
"loss": 1.6155,
"step": 4710
},
{
"epoch": 0.15249911149882073,
"grad_norm": 8.645523071289062,
"learning_rate": 1.7967266159925864e-07,
"loss": 1.5958,
"step": 4720
},
{
"epoch": 0.15282220283674194,
"grad_norm": 13.718961715698242,
"learning_rate": 1.670995698781777e-07,
"loss": 1.5768,
"step": 4730
},
{
"epoch": 0.1531452941746632,
"grad_norm": 12.2525634765625,
"learning_rate": 1.549791893306424e-07,
"loss": 1.571,
"step": 4740
},
{
"epoch": 0.1534683855125844,
"grad_norm": 7.851583003997803,
"learning_rate": 1.4331201817802332e-07,
"loss": 1.7923,
"step": 4750
},
{
"epoch": 0.15379147685050565,
"grad_norm": 10.048659324645996,
"learning_rate": 1.320985360120322e-07,
"loss": 1.7102,
"step": 4760
},
{
"epoch": 0.15411456818842686,
"grad_norm": 9.430795669555664,
"learning_rate": 1.2133920377499848e-07,
"loss": 1.6879,
"step": 4770
},
{
"epoch": 0.1544376595263481,
"grad_norm": 12.329809188842773,
"learning_rate": 1.1103446374092981e-07,
"loss": 1.7557,
"step": 4780
},
{
"epoch": 0.15476075086426933,
"grad_norm": 11.180129051208496,
"learning_rate": 1.0118473949732765e-07,
"loss": 1.7791,
"step": 4790
},
{
"epoch": 0.15508384220219057,
"grad_norm": 8.690634727478027,
"learning_rate": 9.179043592777716e-08,
"loss": 1.6464,
"step": 4800
},
{
"epoch": 0.15508384220219057,
"eval_loss": 1.6925097703933716,
"eval_runtime": 44.8573,
"eval_samples_per_second": 11.169,
"eval_steps_per_second": 11.169,
"step": 4800
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.0155209275981824e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}