zephyr-7b-sft-full / trainer_state.json
fhalation's picture
Model save
d1e8b41 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1797,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0027824151363383415,
"grad_norm": 11.703799339759724,
"learning_rate": 4.444444444444445e-07,
"loss": 1.1659,
"mean_token_accuracy": 0.6976747632026672,
"num_tokens": 585246.0,
"step": 5
},
{
"epoch": 0.005564830272676683,
"grad_norm": 5.602071651449005,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.0864,
"mean_token_accuracy": 0.7083896398544312,
"num_tokens": 1171311.0,
"step": 10
},
{
"epoch": 0.008347245409015025,
"grad_norm": 3.1420241423044755,
"learning_rate": 1.5555555555555558e-06,
"loss": 1.0324,
"mean_token_accuracy": 0.7197834849357605,
"num_tokens": 1757884.0,
"step": 15
},
{
"epoch": 0.011129660545353366,
"grad_norm": 2.934538436088403,
"learning_rate": 2.1111111111111114e-06,
"loss": 1.0212,
"mean_token_accuracy": 0.7205601453781127,
"num_tokens": 2351106.0,
"step": 20
},
{
"epoch": 0.013912075681691708,
"grad_norm": 2.7899043139645996,
"learning_rate": 2.666666666666667e-06,
"loss": 0.9897,
"mean_token_accuracy": 0.7282361865043641,
"num_tokens": 2942453.0,
"step": 25
},
{
"epoch": 0.01669449081803005,
"grad_norm": 2.618414182725373,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.9967,
"mean_token_accuracy": 0.7268516778945923,
"num_tokens": 3524350.0,
"step": 30
},
{
"epoch": 0.019476905954368393,
"grad_norm": 3.312970663728368,
"learning_rate": 3.777777777777778e-06,
"loss": 1.0133,
"mean_token_accuracy": 0.7244957327842713,
"num_tokens": 4116171.0,
"step": 35
},
{
"epoch": 0.022259321090706732,
"grad_norm": 2.5606592497391945,
"learning_rate": 4.333333333333334e-06,
"loss": 1.0046,
"mean_token_accuracy": 0.724289059638977,
"num_tokens": 4703551.0,
"step": 40
},
{
"epoch": 0.025041736227045076,
"grad_norm": 3.539575821864669,
"learning_rate": 4.888888888888889e-06,
"loss": 1.0161,
"mean_token_accuracy": 0.7216586589813232,
"num_tokens": 5283285.0,
"step": 45
},
{
"epoch": 0.027824151363383415,
"grad_norm": 2.9309023782989936,
"learning_rate": 5.444444444444445e-06,
"loss": 1.0203,
"mean_token_accuracy": 0.7215822339057922,
"num_tokens": 5876874.0,
"step": 50
},
{
"epoch": 0.03060656649972176,
"grad_norm": 3.284413853809602,
"learning_rate": 6e-06,
"loss": 0.9989,
"mean_token_accuracy": 0.7253866314888,
"num_tokens": 6470253.0,
"step": 55
},
{
"epoch": 0.0333889816360601,
"grad_norm": 3.035514802811082,
"learning_rate": 6.555555555555556e-06,
"loss": 0.9791,
"mean_token_accuracy": 0.7286934494972229,
"num_tokens": 7056051.0,
"step": 60
},
{
"epoch": 0.036171396772398445,
"grad_norm": 2.8291474051822996,
"learning_rate": 7.111111111111112e-06,
"loss": 0.9859,
"mean_token_accuracy": 0.7266968131065369,
"num_tokens": 7640554.0,
"step": 65
},
{
"epoch": 0.038953811908736785,
"grad_norm": 2.76432791315742,
"learning_rate": 7.666666666666667e-06,
"loss": 1.0127,
"mean_token_accuracy": 0.7222738623619079,
"num_tokens": 8235176.0,
"step": 70
},
{
"epoch": 0.041736227045075125,
"grad_norm": 3.0121797507858985,
"learning_rate": 8.222222222222222e-06,
"loss": 1.033,
"mean_token_accuracy": 0.7161361455917359,
"num_tokens": 8823675.0,
"step": 75
},
{
"epoch": 0.044518642181413465,
"grad_norm": 2.6579939949703095,
"learning_rate": 8.777777777777778e-06,
"loss": 1.0106,
"mean_token_accuracy": 0.722004747390747,
"num_tokens": 9419065.0,
"step": 80
},
{
"epoch": 0.04730105731775181,
"grad_norm": 2.756043313002429,
"learning_rate": 9.333333333333334e-06,
"loss": 1.0284,
"mean_token_accuracy": 0.7184515237808228,
"num_tokens": 10018707.0,
"step": 85
},
{
"epoch": 0.05008347245409015,
"grad_norm": 2.7330649459439487,
"learning_rate": 9.88888888888889e-06,
"loss": 1.0381,
"mean_token_accuracy": 0.716748857498169,
"num_tokens": 10615769.0,
"step": 90
},
{
"epoch": 0.05286588759042849,
"grad_norm": 3.119124532731729,
"learning_rate": 1.0444444444444445e-05,
"loss": 1.0159,
"mean_token_accuracy": 0.7196141958236695,
"num_tokens": 11206791.0,
"step": 95
},
{
"epoch": 0.05564830272676683,
"grad_norm": 2.6869099855331524,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.0259,
"mean_token_accuracy": 0.7180516362190247,
"num_tokens": 11794887.0,
"step": 100
},
{
"epoch": 0.05843071786310518,
"grad_norm": 2.6746654996398225,
"learning_rate": 1.1555555555555556e-05,
"loss": 1.0068,
"mean_token_accuracy": 0.7229955673217774,
"num_tokens": 12385737.0,
"step": 105
},
{
"epoch": 0.06121313299944352,
"grad_norm": 2.9016124547228124,
"learning_rate": 1.211111111111111e-05,
"loss": 1.0407,
"mean_token_accuracy": 0.7161330699920654,
"num_tokens": 12974642.0,
"step": 110
},
{
"epoch": 0.06399554813578186,
"grad_norm": 2.7590693104720887,
"learning_rate": 1.2666666666666667e-05,
"loss": 1.034,
"mean_token_accuracy": 0.7153326869010925,
"num_tokens": 13580517.0,
"step": 115
},
{
"epoch": 0.0667779632721202,
"grad_norm": 3.0191169781226734,
"learning_rate": 1.3222222222222223e-05,
"loss": 1.0218,
"mean_token_accuracy": 0.7199317574501037,
"num_tokens": 14169869.0,
"step": 120
},
{
"epoch": 0.06956037840845854,
"grad_norm": 2.673488197212258,
"learning_rate": 1.377777777777778e-05,
"loss": 1.0589,
"mean_token_accuracy": 0.7112971425056458,
"num_tokens": 14758276.0,
"step": 125
},
{
"epoch": 0.07234279354479689,
"grad_norm": 2.7094276528268795,
"learning_rate": 1.4333333333333334e-05,
"loss": 1.049,
"mean_token_accuracy": 0.712811267375946,
"num_tokens": 15345599.0,
"step": 130
},
{
"epoch": 0.07512520868113523,
"grad_norm": 2.6373785827567957,
"learning_rate": 1.488888888888889e-05,
"loss": 1.0225,
"mean_token_accuracy": 0.7185760021209717,
"num_tokens": 15932693.0,
"step": 135
},
{
"epoch": 0.07790762381747357,
"grad_norm": 2.9632223301755154,
"learning_rate": 1.5444444444444446e-05,
"loss": 1.0654,
"mean_token_accuracy": 0.7110099554061889,
"num_tokens": 16520098.0,
"step": 140
},
{
"epoch": 0.08069003895381191,
"grad_norm": 2.817380088138206,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.0674,
"mean_token_accuracy": 0.709651243686676,
"num_tokens": 17110470.0,
"step": 145
},
{
"epoch": 0.08347245409015025,
"grad_norm": 2.6945752947019073,
"learning_rate": 1.6555555555555556e-05,
"loss": 1.054,
"mean_token_accuracy": 0.7122701048851013,
"num_tokens": 17696206.0,
"step": 150
},
{
"epoch": 0.08625486922648859,
"grad_norm": 2.397611832219796,
"learning_rate": 1.7111111111111112e-05,
"loss": 1.0735,
"mean_token_accuracy": 0.709396231174469,
"num_tokens": 18292078.0,
"step": 155
},
{
"epoch": 0.08903728436282693,
"grad_norm": 2.7655354856899517,
"learning_rate": 1.7666666666666668e-05,
"loss": 1.0809,
"mean_token_accuracy": 0.7083318114280701,
"num_tokens": 18888122.0,
"step": 160
},
{
"epoch": 0.09181969949916527,
"grad_norm": 2.758607000704,
"learning_rate": 1.8222222222222224e-05,
"loss": 1.0822,
"mean_token_accuracy": 0.706460428237915,
"num_tokens": 19484865.0,
"step": 165
},
{
"epoch": 0.09460211463550362,
"grad_norm": 2.570904335556245,
"learning_rate": 1.877777777777778e-05,
"loss": 1.0826,
"mean_token_accuracy": 0.7078906774520874,
"num_tokens": 20071044.0,
"step": 170
},
{
"epoch": 0.09738452977184196,
"grad_norm": 2.735490789384101,
"learning_rate": 1.9333333333333333e-05,
"loss": 1.0858,
"mean_token_accuracy": 0.7050360441207886,
"num_tokens": 20662778.0,
"step": 175
},
{
"epoch": 0.1001669449081803,
"grad_norm": 2.7739191020812655,
"learning_rate": 1.988888888888889e-05,
"loss": 1.0681,
"mean_token_accuracy": 0.7093758583068848,
"num_tokens": 21253812.0,
"step": 180
},
{
"epoch": 0.10294936004451864,
"grad_norm": 2.8028592827713363,
"learning_rate": 1.9999698027421894e-05,
"loss": 1.0702,
"mean_token_accuracy": 0.7094247817993165,
"num_tokens": 21843322.0,
"step": 185
},
{
"epoch": 0.10573177518085698,
"grad_norm": 2.595713944513709,
"learning_rate": 1.9998471295079908e-05,
"loss": 1.0458,
"mean_token_accuracy": 0.7138230800628662,
"num_tokens": 22433061.0,
"step": 190
},
{
"epoch": 0.10851419031719532,
"grad_norm": 2.7873986833719946,
"learning_rate": 1.9996301045360874e-05,
"loss": 1.0974,
"mean_token_accuracy": 0.704916775226593,
"num_tokens": 23021153.0,
"step": 195
},
{
"epoch": 0.11129660545353366,
"grad_norm": 2.439806161655261,
"learning_rate": 1.9993187483062935e-05,
"loss": 1.0771,
"mean_token_accuracy": 0.7064628720283508,
"num_tokens": 23609275.0,
"step": 200
},
{
"epoch": 0.11407902058987202,
"grad_norm": 2.4957981823187763,
"learning_rate": 1.9989130902001025e-05,
"loss": 1.0917,
"mean_token_accuracy": 0.7053624391555786,
"num_tokens": 24205073.0,
"step": 205
},
{
"epoch": 0.11686143572621036,
"grad_norm": 2.896823560689145,
"learning_rate": 1.9984131684979134e-05,
"loss": 1.1004,
"mean_token_accuracy": 0.7049420475959778,
"num_tokens": 24800240.0,
"step": 210
},
{
"epoch": 0.1196438508625487,
"grad_norm": 2.5192627022618623,
"learning_rate": 1.997819030375419e-05,
"loss": 1.0623,
"mean_token_accuracy": 0.7119413614273071,
"num_tokens": 25387549.0,
"step": 215
},
{
"epoch": 0.12242626599888703,
"grad_norm": 2.8132540932895695,
"learning_rate": 1.9971307318991546e-05,
"loss": 1.0915,
"mean_token_accuracy": 0.7074636220932007,
"num_tokens": 25965133.0,
"step": 220
},
{
"epoch": 0.12520868113522537,
"grad_norm": 35.333375177867765,
"learning_rate": 1.996348338021207e-05,
"loss": 1.114,
"mean_token_accuracy": 0.700543737411499,
"num_tokens": 26563977.0,
"step": 225
},
{
"epoch": 0.12799109627156371,
"grad_norm": 2.920196605955216,
"learning_rate": 1.9954719225730847e-05,
"loss": 1.1139,
"mean_token_accuracy": 0.7011779904365539,
"num_tokens": 27156932.0,
"step": 230
},
{
"epoch": 0.13077351140790205,
"grad_norm": 2.60574074107156,
"learning_rate": 1.9945015682587512e-05,
"loss": 1.096,
"mean_token_accuracy": 0.7043320059776306,
"num_tokens": 27754019.0,
"step": 235
},
{
"epoch": 0.1335559265442404,
"grad_norm": 2.865631892072952,
"learning_rate": 1.9934373666468203e-05,
"loss": 1.0804,
"mean_token_accuracy": 0.706881308555603,
"num_tokens": 28342275.0,
"step": 240
},
{
"epoch": 0.13633834168057873,
"grad_norm": 2.504324070572632,
"learning_rate": 1.992279418161915e-05,
"loss": 1.099,
"mean_token_accuracy": 0.7036979913711547,
"num_tokens": 28928534.0,
"step": 245
},
{
"epoch": 0.13912075681691707,
"grad_norm": 2.6138293066655045,
"learning_rate": 1.991027832075192e-05,
"loss": 1.0921,
"mean_token_accuracy": 0.7047542929649353,
"num_tokens": 29513990.0,
"step": 250
},
{
"epoch": 0.1419031719532554,
"grad_norm": 2.3711537074020037,
"learning_rate": 1.989682726494028e-05,
"loss": 1.0562,
"mean_token_accuracy": 0.7139820337295533,
"num_tokens": 30113881.0,
"step": 255
},
{
"epoch": 0.14468558708959378,
"grad_norm": 2.338233428322228,
"learning_rate": 1.988244228350877e-05,
"loss": 1.0811,
"mean_token_accuracy": 0.7071714401245117,
"num_tokens": 30700467.0,
"step": 260
},
{
"epoch": 0.14746800222593212,
"grad_norm": 2.5175646612189655,
"learning_rate": 1.986712473391289e-05,
"loss": 1.0979,
"mean_token_accuracy": 0.7044062852859497,
"num_tokens": 31292719.0,
"step": 265
},
{
"epoch": 0.15025041736227046,
"grad_norm": 2.40596554930329,
"learning_rate": 1.9850876061611036e-05,
"loss": 1.092,
"mean_token_accuracy": 0.706499433517456,
"num_tokens": 31883435.0,
"step": 270
},
{
"epoch": 0.1530328324986088,
"grad_norm": 2.4796442221955384,
"learning_rate": 1.9833697799928074e-05,
"loss": 1.0967,
"mean_token_accuracy": 0.7027202010154724,
"num_tokens": 32481693.0,
"step": 275
},
{
"epoch": 0.15581524763494714,
"grad_norm": 2.4780125377517663,
"learning_rate": 1.9815591569910654e-05,
"loss": 1.1121,
"mean_token_accuracy": 0.7006395697593689,
"num_tokens": 33071205.0,
"step": 280
},
{
"epoch": 0.15859766277128548,
"grad_norm": 2.3510385258823976,
"learning_rate": 1.979655908017424e-05,
"loss": 1.0861,
"mean_token_accuracy": 0.7057282090187073,
"num_tokens": 33671052.0,
"step": 285
},
{
"epoch": 0.16138007790762382,
"grad_norm": 2.5067301813131655,
"learning_rate": 1.9776602126741867e-05,
"loss": 1.0807,
"mean_token_accuracy": 0.7070404767990113,
"num_tokens": 34260518.0,
"step": 290
},
{
"epoch": 0.16416249304396216,
"grad_norm": 2.28351268206912,
"learning_rate": 1.975572259287467e-05,
"loss": 1.0803,
"mean_token_accuracy": 0.7072898864746093,
"num_tokens": 34848727.0,
"step": 295
},
{
"epoch": 0.1669449081803005,
"grad_norm": 2.7544633183153926,
"learning_rate": 1.973392244889415e-05,
"loss": 1.0854,
"mean_token_accuracy": 0.7059407949447631,
"num_tokens": 35437641.0,
"step": 300
},
{
"epoch": 0.16972732331663884,
"grad_norm": 2.3830000818389925,
"learning_rate": 1.9711203751996267e-05,
"loss": 1.0988,
"mean_token_accuracy": 0.7048187136650086,
"num_tokens": 36037300.0,
"step": 305
},
{
"epoch": 0.17250973845297718,
"grad_norm": 2.360850123530632,
"learning_rate": 1.9687568646057277e-05,
"loss": 1.0736,
"mean_token_accuracy": 0.7092967867851258,
"num_tokens": 36629457.0,
"step": 310
},
{
"epoch": 0.17529215358931552,
"grad_norm": 2.2707504412701582,
"learning_rate": 1.966301936143146e-05,
"loss": 1.0958,
"mean_token_accuracy": 0.7042588949203491,
"num_tokens": 37224522.0,
"step": 315
},
{
"epoch": 0.17807456872565386,
"grad_norm": 2.201925130311423,
"learning_rate": 1.9637558214740618e-05,
"loss": 1.0964,
"mean_token_accuracy": 0.7053308248519897,
"num_tokens": 37815851.0,
"step": 320
},
{
"epoch": 0.1808569838619922,
"grad_norm": 2.2178593995634697,
"learning_rate": 1.9611187608655484e-05,
"loss": 1.1105,
"mean_token_accuracy": 0.7033011674880981,
"num_tokens": 38396495.0,
"step": 325
},
{
"epoch": 0.18363939899833054,
"grad_norm": 2.2136118629553185,
"learning_rate": 1.9583910031668984e-05,
"loss": 1.0862,
"mean_token_accuracy": 0.7051831126213074,
"num_tokens": 38994173.0,
"step": 330
},
{
"epoch": 0.1864218141346689,
"grad_norm": 2.292573884422775,
"learning_rate": 1.955572805786141e-05,
"loss": 1.1233,
"mean_token_accuracy": 0.7006201148033142,
"num_tokens": 39577921.0,
"step": 335
},
{
"epoch": 0.18920422927100725,
"grad_norm": 2.2980324001779873,
"learning_rate": 1.9526644346657508e-05,
"loss": 1.1007,
"mean_token_accuracy": 0.7046499371528625,
"num_tokens": 40170313.0,
"step": 340
},
{
"epoch": 0.19198664440734559,
"grad_norm": 2.2611492444425916,
"learning_rate": 1.9496661642575517e-05,
"loss": 1.065,
"mean_token_accuracy": 0.7105947017669678,
"num_tokens": 40765429.0,
"step": 345
},
{
"epoch": 0.19476905954368393,
"grad_norm": 2.1512493790533185,
"learning_rate": 1.946578277496821e-05,
"loss": 1.0917,
"mean_token_accuracy": 0.7073456883430481,
"num_tokens": 41365863.0,
"step": 350
},
{
"epoch": 0.19755147468002227,
"grad_norm": 2.271847848176885,
"learning_rate": 1.943401065775584e-05,
"loss": 1.1011,
"mean_token_accuracy": 0.7054303050041199,
"num_tokens": 41957368.0,
"step": 355
},
{
"epoch": 0.2003338898163606,
"grad_norm": 2.222128508786771,
"learning_rate": 1.940134828915123e-05,
"loss": 1.1086,
"mean_token_accuracy": 0.7034181118011474,
"num_tokens": 42546312.0,
"step": 360
},
{
"epoch": 0.20311630495269895,
"grad_norm": 2.159135050960954,
"learning_rate": 1.936779875137678e-05,
"loss": 1.0821,
"mean_token_accuracy": 0.707557737827301,
"num_tokens": 43141094.0,
"step": 365
},
{
"epoch": 0.20589872008903728,
"grad_norm": 2.2055966042491546,
"learning_rate": 1.9333365210373668e-05,
"loss": 1.0902,
"mean_token_accuracy": 0.705539345741272,
"num_tokens": 43731225.0,
"step": 370
},
{
"epoch": 0.20868113522537562,
"grad_norm": 2.1858790427306785,
"learning_rate": 1.9298050915503053e-05,
"loss": 1.1066,
"mean_token_accuracy": 0.7038124799728394,
"num_tokens": 44311286.0,
"step": 375
},
{
"epoch": 0.21146355036171396,
"grad_norm": 2.204033703125247,
"learning_rate": 1.926185919923946e-05,
"loss": 1.0971,
"mean_token_accuracy": 0.7054288148880005,
"num_tokens": 44906755.0,
"step": 380
},
{
"epoch": 0.2142459654980523,
"grad_norm": 2.2598831388779694,
"learning_rate": 1.9224793476856293e-05,
"loss": 1.1201,
"mean_token_accuracy": 0.700083589553833,
"num_tokens": 45491095.0,
"step": 385
},
{
"epoch": 0.21702838063439064,
"grad_norm": 2.1902124071852977,
"learning_rate": 1.9186857246103586e-05,
"loss": 1.079,
"mean_token_accuracy": 0.7079141974449158,
"num_tokens": 46084794.0,
"step": 390
},
{
"epoch": 0.21981079577072898,
"grad_norm": 2.225166806223554,
"learning_rate": 1.9148054086877884e-05,
"loss": 1.0965,
"mean_token_accuracy": 0.7044667720794677,
"num_tokens": 46674587.0,
"step": 395
},
{
"epoch": 0.22259321090706732,
"grad_norm": 2.110266160830183,
"learning_rate": 1.9108387660884456e-05,
"loss": 1.1019,
"mean_token_accuracy": 0.7042423367500306,
"num_tokens": 47263613.0,
"step": 400
},
{
"epoch": 0.22537562604340566,
"grad_norm": 2.137008460009227,
"learning_rate": 1.9067861711291744e-05,
"loss": 1.0984,
"mean_token_accuracy": 0.7045777201652527,
"num_tokens": 47848405.0,
"step": 405
},
{
"epoch": 0.22815804117974403,
"grad_norm": 2.5090054849409573,
"learning_rate": 1.9026480062378136e-05,
"loss": 1.1232,
"mean_token_accuracy": 0.7006029844284057,
"num_tokens": 48440420.0,
"step": 410
},
{
"epoch": 0.23094045631608237,
"grad_norm": 2.5298900029195934,
"learning_rate": 1.8984246619171075e-05,
"loss": 1.0998,
"mean_token_accuracy": 0.7040575265884399,
"num_tokens": 49026577.0,
"step": 415
},
{
"epoch": 0.2337228714524207,
"grad_norm": 2.243788572378369,
"learning_rate": 1.894116536707857e-05,
"loss": 1.0931,
"mean_token_accuracy": 0.7059786558151245,
"num_tokens": 49618303.0,
"step": 420
},
{
"epoch": 0.23650528658875905,
"grad_norm": 2.2973971910882853,
"learning_rate": 1.8897240371513098e-05,
"loss": 1.1076,
"mean_token_accuracy": 0.7032187581062317,
"num_tokens": 50211716.0,
"step": 425
},
{
"epoch": 0.2392877017250974,
"grad_norm": 2.121159415163043,
"learning_rate": 1.8852475777507983e-05,
"loss": 1.0882,
"mean_token_accuracy": 0.7079625129699707,
"num_tokens": 50806268.0,
"step": 430
},
{
"epoch": 0.24207011686143573,
"grad_norm": 2.2653088399729593,
"learning_rate": 1.8806875809326204e-05,
"loss": 1.0988,
"mean_token_accuracy": 0.7044902324676514,
"num_tokens": 51395551.0,
"step": 435
},
{
"epoch": 0.24485253199777407,
"grad_norm": 2.0640694310319647,
"learning_rate": 1.876044477006183e-05,
"loss": 1.1057,
"mean_token_accuracy": 0.7019346117973327,
"num_tokens": 51988430.0,
"step": 440
},
{
"epoch": 0.2476349471341124,
"grad_norm": 2.1331480343408225,
"learning_rate": 1.8713187041233896e-05,
"loss": 1.0845,
"mean_token_accuracy": 0.7060743689537048,
"num_tokens": 52583147.0,
"step": 445
},
{
"epoch": 0.25041736227045075,
"grad_norm": 2.4013534842444186,
"learning_rate": 1.866510708237297e-05,
"loss": 1.0979,
"mean_token_accuracy": 0.7047066450119018,
"num_tokens": 53181352.0,
"step": 450
},
{
"epoch": 0.2531997774067891,
"grad_norm": 2.3023711179533226,
"learning_rate": 1.861620943060031e-05,
"loss": 1.1275,
"mean_token_accuracy": 0.6983560442924499,
"num_tokens": 53772836.0,
"step": 455
},
{
"epoch": 0.25598219254312743,
"grad_norm": 2.2577981667782208,
"learning_rate": 1.856649870019972e-05,
"loss": 1.0957,
"mean_token_accuracy": 0.7056548476219178,
"num_tokens": 54367700.0,
"step": 460
},
{
"epoch": 0.2587646076794658,
"grad_norm": 2.676938686558113,
"learning_rate": 1.8515979582182112e-05,
"loss": 1.0906,
"mean_token_accuracy": 0.707176685333252,
"num_tokens": 54960810.0,
"step": 465
},
{
"epoch": 0.2615470228158041,
"grad_norm": 2.4165806936926133,
"learning_rate": 1.8464656843842837e-05,
"loss": 1.0897,
"mean_token_accuracy": 0.7070010900497437,
"num_tokens": 55550003.0,
"step": 470
},
{
"epoch": 0.2643294379521425,
"grad_norm": 2.461649569325264,
"learning_rate": 1.8412535328311813e-05,
"loss": 1.1121,
"mean_token_accuracy": 0.7028052568435669,
"num_tokens": 56136218.0,
"step": 475
},
{
"epoch": 0.2671118530884808,
"grad_norm": 2.1804919862847365,
"learning_rate": 1.8359619954096497e-05,
"loss": 1.1076,
"mean_token_accuracy": 0.7032665610313416,
"num_tokens": 56726599.0,
"step": 480
},
{
"epoch": 0.26989426822481916,
"grad_norm": 2.68554258980135,
"learning_rate": 1.8305915714617745e-05,
"loss": 1.0993,
"mean_token_accuracy": 0.7033315062522888,
"num_tokens": 57321297.0,
"step": 485
},
{
"epoch": 0.27267668336115747,
"grad_norm": 2.3164265508749544,
"learning_rate": 1.8251427677738596e-05,
"loss": 1.067,
"mean_token_accuracy": 0.710555636882782,
"num_tokens": 57913003.0,
"step": 490
},
{
"epoch": 0.27545909849749584,
"grad_norm": 2.25329369986598,
"learning_rate": 1.8196160985286052e-05,
"loss": 1.0913,
"mean_token_accuracy": 0.708107590675354,
"num_tokens": 58499228.0,
"step": 495
},
{
"epoch": 0.27824151363383415,
"grad_norm": 2.2429809416951656,
"learning_rate": 1.814012085256585e-05,
"loss": 1.0993,
"mean_token_accuracy": 0.7040925621986389,
"num_tokens": 59090708.0,
"step": 500
},
{
"epoch": 0.2810239287701725,
"grad_norm": 2.2073249069907384,
"learning_rate": 1.8083312567870315e-05,
"loss": 1.0879,
"mean_token_accuracy": 0.7081225514411926,
"num_tokens": 59685930.0,
"step": 505
},
{
"epoch": 0.2838063439065108,
"grad_norm": 2.0857359362520214,
"learning_rate": 1.8025741491979326e-05,
"loss": 1.0616,
"mean_token_accuracy": 0.7111712694168091,
"num_tokens": 60280434.0,
"step": 510
},
{
"epoch": 0.2865887590428492,
"grad_norm": 2.163016097659582,
"learning_rate": 1.7967413057654452e-05,
"loss": 1.0775,
"mean_token_accuracy": 0.7096009373664856,
"num_tokens": 60868682.0,
"step": 515
},
{
"epoch": 0.28937117417918756,
"grad_norm": 2.3117437321963306,
"learning_rate": 1.7908332769126255e-05,
"loss": 1.1076,
"mean_token_accuracy": 0.7027746677398682,
"num_tokens": 61458691.0,
"step": 520
},
{
"epoch": 0.2921535893155259,
"grad_norm": 2.4449046203168856,
"learning_rate": 1.784850620157491e-05,
"loss": 1.0963,
"mean_token_accuracy": 0.7077104687690735,
"num_tokens": 62050298.0,
"step": 525
},
{
"epoch": 0.29493600445186424,
"grad_norm": 2.2894244206650574,
"learning_rate": 1.7787939000604063e-05,
"loss": 1.074,
"mean_token_accuracy": 0.709146237373352,
"num_tokens": 62641275.0,
"step": 530
},
{
"epoch": 0.29771841958820255,
"grad_norm": 2.235175132800985,
"learning_rate": 1.7726636881708114e-05,
"loss": 1.0921,
"mean_token_accuracy": 0.7072658061981201,
"num_tokens": 63230436.0,
"step": 535
},
{
"epoch": 0.3005008347245409,
"grad_norm": 2.1669106168397665,
"learning_rate": 1.7664605629732832e-05,
"loss": 1.0954,
"mean_token_accuracy": 0.7046370029449462,
"num_tokens": 63818119.0,
"step": 540
},
{
"epoch": 0.30328324986087923,
"grad_norm": 2.1441378931657478,
"learning_rate": 1.7601851098329484e-05,
"loss": 1.0671,
"mean_token_accuracy": 0.710686981678009,
"num_tokens": 64410016.0,
"step": 545
},
{
"epoch": 0.3060656649972176,
"grad_norm": 2.124149203635388,
"learning_rate": 1.7538379209402442e-05,
"loss": 1.0893,
"mean_token_accuracy": 0.7079866886138916,
"num_tokens": 65008878.0,
"step": 550
},
{
"epoch": 0.3088480801335559,
"grad_norm": 2.092451847928283,
"learning_rate": 1.7474195952550355e-05,
"loss": 1.0911,
"mean_token_accuracy": 0.7058361053466797,
"num_tokens": 65591920.0,
"step": 555
},
{
"epoch": 0.3116304952698943,
"grad_norm": 1.979401747526117,
"learning_rate": 1.7409307384500932e-05,
"loss": 1.0781,
"mean_token_accuracy": 0.7093043208122254,
"num_tokens": 66183326.0,
"step": 560
},
{
"epoch": 0.3144129104062326,
"grad_norm": 2.3013535796680133,
"learning_rate": 1.7343719628539396e-05,
"loss": 1.1062,
"mean_token_accuracy": 0.7034829258918762,
"num_tokens": 66770419.0,
"step": 565
},
{
"epoch": 0.31719532554257096,
"grad_norm": 2.232144210677623,
"learning_rate": 1.7277438873930654e-05,
"loss": 1.0888,
"mean_token_accuracy": 0.7088476419448853,
"num_tokens": 67356232.0,
"step": 570
},
{
"epoch": 0.3199777406789093,
"grad_norm": 2.2615310284916426,
"learning_rate": 1.7210471375335225e-05,
"loss": 1.0762,
"mean_token_accuracy": 0.7096709370613098,
"num_tokens": 67948261.0,
"step": 575
},
{
"epoch": 0.32276015581524764,
"grad_norm": 2.069623037881395,
"learning_rate": 1.7142823452219036e-05,
"loss": 1.0584,
"mean_token_accuracy": 0.7133225679397583,
"num_tokens": 68530816.0,
"step": 580
},
{
"epoch": 0.32554257095158595,
"grad_norm": 2.01949110828742,
"learning_rate": 1.7074501488257062e-05,
"loss": 1.0771,
"mean_token_accuracy": 0.7082255363464356,
"num_tokens": 69121402.0,
"step": 585
},
{
"epoch": 0.3283249860879243,
"grad_norm": 2.114712605110562,
"learning_rate": 1.700551193073092e-05,
"loss": 1.0434,
"mean_token_accuracy": 0.7137895464897156,
"num_tokens": 69707900.0,
"step": 590
},
{
"epoch": 0.3311074012242627,
"grad_norm": 2.1204083275143533,
"learning_rate": 1.693586128992048e-05,
"loss": 1.0753,
"mean_token_accuracy": 0.7090141296386718,
"num_tokens": 70297299.0,
"step": 595
},
{
"epoch": 0.333889816360601,
"grad_norm": 2.170713633905745,
"learning_rate": 1.6865556138489497e-05,
"loss": 1.0944,
"mean_token_accuracy": 0.706296420097351,
"num_tokens": 70886257.0,
"step": 600
},
{
"epoch": 0.33667223149693937,
"grad_norm": 2.1428343367458074,
"learning_rate": 1.6794603110865396e-05,
"loss": 1.0871,
"mean_token_accuracy": 0.7076637268066406,
"num_tokens": 71474356.0,
"step": 605
},
{
"epoch": 0.3394546466332777,
"grad_norm": 3.3075272432648886,
"learning_rate": 1.672300890261317e-05,
"loss": 1.044,
"mean_token_accuracy": 0.7172706961631775,
"num_tokens": 72059816.0,
"step": 610
},
{
"epoch": 0.34223706176961605,
"grad_norm": 2.1270286082703573,
"learning_rate": 1.6650780269803587e-05,
"loss": 1.0863,
"mean_token_accuracy": 0.7074844360351562,
"num_tokens": 72652774.0,
"step": 615
},
{
"epoch": 0.34501947690595436,
"grad_norm": 2.100968885315603,
"learning_rate": 1.6577924028375622e-05,
"loss": 1.0677,
"mean_token_accuracy": 0.71006840467453,
"num_tokens": 73239819.0,
"step": 620
},
{
"epoch": 0.3478018920422927,
"grad_norm": 2.008632532868866,
"learning_rate": 1.6504447053493264e-05,
"loss": 1.0645,
"mean_token_accuracy": 0.7101643443107605,
"num_tokens": 73831159.0,
"step": 625
},
{
"epoch": 0.35058430717863104,
"grad_norm": 2.0858240283477545,
"learning_rate": 1.643035627889674e-05,
"loss": 1.0717,
"mean_token_accuracy": 0.7094730496406555,
"num_tokens": 74422688.0,
"step": 630
},
{
"epoch": 0.3533667223149694,
"grad_norm": 2.1384864549062197,
"learning_rate": 1.63556586962482e-05,
"loss": 1.1,
"mean_token_accuracy": 0.7050098419189453,
"num_tokens": 75009215.0,
"step": 635
},
{
"epoch": 0.3561491374513077,
"grad_norm": 2.106464767907768,
"learning_rate": 1.628036135447194e-05,
"loss": 1.0894,
"mean_token_accuracy": 0.707071328163147,
"num_tokens": 75598228.0,
"step": 640
},
{
"epoch": 0.3589315525876461,
"grad_norm": 2.0551179685857144,
"learning_rate": 1.6204471359089224e-05,
"loss": 1.0785,
"mean_token_accuracy": 0.7078182816505432,
"num_tokens": 76186740.0,
"step": 645
},
{
"epoch": 0.3617139677239844,
"grad_norm": 2.1646737527032314,
"learning_rate": 1.612799587154777e-05,
"loss": 1.07,
"mean_token_accuracy": 0.7111572623252869,
"num_tokens": 76774832.0,
"step": 650
},
{
"epoch": 0.36449638286032277,
"grad_norm": 2.104548936492404,
"learning_rate": 1.6050942108545938e-05,
"loss": 1.0747,
"mean_token_accuracy": 0.7105032086372376,
"num_tokens": 77363315.0,
"step": 655
},
{
"epoch": 0.3672787979966611,
"grad_norm": 2.0651187265677216,
"learning_rate": 1.5973317341351725e-05,
"loss": 1.0697,
"mean_token_accuracy": 0.7097868919372559,
"num_tokens": 77951799.0,
"step": 660
},
{
"epoch": 0.37006121313299944,
"grad_norm": 1.9772941125582544,
"learning_rate": 1.58951288951166e-05,
"loss": 1.0703,
"mean_token_accuracy": 0.7106229305267334,
"num_tokens": 78551404.0,
"step": 665
},
{
"epoch": 0.3728436282693378,
"grad_norm": 2.081741954302233,
"learning_rate": 1.5816384148184273e-05,
"loss": 1.0564,
"mean_token_accuracy": 0.7130509853363037,
"num_tokens": 79148333.0,
"step": 670
},
{
"epoch": 0.3756260434056761,
"grad_norm": 2.116111905318221,
"learning_rate": 1.57370905313944e-05,
"loss": 1.0901,
"mean_token_accuracy": 0.7071909785270691,
"num_tokens": 79731743.0,
"step": 675
},
{
"epoch": 0.3784084585420145,
"grad_norm": 2.010006167198628,
"learning_rate": 1.5657255527381395e-05,
"loss": 1.0741,
"mean_token_accuracy": 0.7091400980949402,
"num_tokens": 80332028.0,
"step": 680
},
{
"epoch": 0.3811908736783528,
"grad_norm": 2.0107259580616956,
"learning_rate": 1.5576886669868297e-05,
"loss": 1.0492,
"mean_token_accuracy": 0.7131890416145324,
"num_tokens": 80923863.0,
"step": 685
},
{
"epoch": 0.38397328881469117,
"grad_norm": 1.9791050603322653,
"learning_rate": 1.5495991542955855e-05,
"loss": 1.0503,
"mean_token_accuracy": 0.7160694479942322,
"num_tokens": 81512560.0,
"step": 690
},
{
"epoch": 0.3867557039510295,
"grad_norm": 1.9490662087580195,
"learning_rate": 1.541457778040684e-05,
"loss": 1.0529,
"mean_token_accuracy": 0.7135980725288391,
"num_tokens": 82097379.0,
"step": 695
},
{
"epoch": 0.38953811908736785,
"grad_norm": 2.0179615858909377,
"learning_rate": 1.5332653064925683e-05,
"loss": 1.0519,
"mean_token_accuracy": 0.7147277235984802,
"num_tokens": 82685268.0,
"step": 700
},
{
"epoch": 0.39232053422370616,
"grad_norm": 2.0450453857514175,
"learning_rate": 1.5250225127433485e-05,
"loss": 1.043,
"mean_token_accuracy": 0.7144908547401428,
"num_tokens": 83277230.0,
"step": 705
},
{
"epoch": 0.39510294936004453,
"grad_norm": 1.9276469987768912,
"learning_rate": 1.5167301746338466e-05,
"loss": 1.0784,
"mean_token_accuracy": 0.7108999609947204,
"num_tokens": 83861406.0,
"step": 710
},
{
"epoch": 0.39788536449638284,
"grad_norm": 1.9433023381927899,
"learning_rate": 1.5083890746801962e-05,
"loss": 1.0597,
"mean_token_accuracy": 0.7121692419052124,
"num_tokens": 84459146.0,
"step": 715
},
{
"epoch": 0.4006677796327212,
"grad_norm": 1.9546874431863348,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.0919,
"mean_token_accuracy": 0.7079582571983337,
"num_tokens": 85049969.0,
"step": 720
},
{
"epoch": 0.4034501947690595,
"grad_norm": 1.9563600752313475,
"learning_rate": 1.491563742238051e-05,
"loss": 1.0692,
"mean_token_accuracy": 0.7110470652580261,
"num_tokens": 85638302.0,
"step": 725
},
{
"epoch": 0.4062326099053979,
"grad_norm": 1.9290273436145757,
"learning_rate": 1.483081097491628e-05,
"loss": 1.0697,
"mean_token_accuracy": 0.7116230726242065,
"num_tokens": 86229643.0,
"step": 730
},
{
"epoch": 0.4090150250417362,
"grad_norm": 1.9858611430728594,
"learning_rate": 1.4745528662353728e-05,
"loss": 1.0483,
"mean_token_accuracy": 0.7151864290237426,
"num_tokens": 86822104.0,
"step": 735
},
{
"epoch": 0.41179744017807457,
"grad_norm": 2.04642483648034,
"learning_rate": 1.4659798532457497e-05,
"loss": 1.0775,
"mean_token_accuracy": 0.7090552926063538,
"num_tokens": 87413792.0,
"step": 740
},
{
"epoch": 0.41457985531441294,
"grad_norm": 1.9893794431614882,
"learning_rate": 1.4573628675251051e-05,
"loss": 1.05,
"mean_token_accuracy": 0.7146545886993408,
"num_tokens": 88001772.0,
"step": 745
},
{
"epoch": 0.41736227045075125,
"grad_norm": 1.9071569454284205,
"learning_rate": 1.4487027222253216e-05,
"loss": 1.071,
"mean_token_accuracy": 0.7112080335617066,
"num_tokens": 88586368.0,
"step": 750
},
{
"epoch": 0.4201446855870896,
"grad_norm": 2.1261499946440106,
"learning_rate": 1.4400002345710871e-05,
"loss": 1.053,
"mean_token_accuracy": 0.7138799786567688,
"num_tokens": 89169649.0,
"step": 755
},
{
"epoch": 0.42292710072342793,
"grad_norm": 2.0601232013853354,
"learning_rate": 1.4312562257827742e-05,
"loss": 1.0592,
"mean_token_accuracy": 0.7137506484985352,
"num_tokens": 89758883.0,
"step": 760
},
{
"epoch": 0.4257095158597663,
"grad_norm": 2.0053263574303126,
"learning_rate": 1.4224715209989463e-05,
"loss": 1.0762,
"mean_token_accuracy": 0.7106667995452881,
"num_tokens": 90343260.0,
"step": 765
},
{
"epoch": 0.4284919309961046,
"grad_norm": 2.040531463590581,
"learning_rate": 1.4136469491984913e-05,
"loss": 1.0532,
"mean_token_accuracy": 0.7144197583198547,
"num_tokens": 90931881.0,
"step": 770
},
{
"epoch": 0.431274346132443,
"grad_norm": 2.1902642918655353,
"learning_rate": 1.4047833431223938e-05,
"loss": 1.0688,
"mean_token_accuracy": 0.7094583511352539,
"num_tokens": 91515784.0,
"step": 775
},
{
"epoch": 0.4340567612687813,
"grad_norm": 2.128849928536235,
"learning_rate": 1.3958815391951552e-05,
"loss": 1.0675,
"mean_token_accuracy": 0.7103721380233765,
"num_tokens": 92113098.0,
"step": 780
},
{
"epoch": 0.43683917640511966,
"grad_norm": 1.9500214149444213,
"learning_rate": 1.3869423774458594e-05,
"loss": 1.0728,
"mean_token_accuracy": 0.7097015857696534,
"num_tokens": 92709566.0,
"step": 785
},
{
"epoch": 0.43962159154145797,
"grad_norm": 1.8764892174897658,
"learning_rate": 1.3779667014289067e-05,
"loss": 1.0431,
"mean_token_accuracy": 0.7169391632080078,
"num_tokens": 93292537.0,
"step": 790
},
{
"epoch": 0.44240400667779634,
"grad_norm": 2.10238424716131,
"learning_rate": 1.3689553581444069e-05,
"loss": 1.0145,
"mean_token_accuracy": 0.7227911353111267,
"num_tokens": 93878784.0,
"step": 795
},
{
"epoch": 0.44518642181413465,
"grad_norm": 2.0641117161806033,
"learning_rate": 1.3599091979582537e-05,
"loss": 1.0576,
"mean_token_accuracy": 0.7129832863807678,
"num_tokens": 94467072.0,
"step": 800
},
{
"epoch": 0.447968836950473,
"grad_norm": 2.0209262573363143,
"learning_rate": 1.3508290745218789e-05,
"loss": 1.0281,
"mean_token_accuracy": 0.7192481160163879,
"num_tokens": 95055139.0,
"step": 805
},
{
"epoch": 0.4507512520868113,
"grad_norm": 1.9960033545510802,
"learning_rate": 1.341715844691695e-05,
"loss": 1.0381,
"mean_token_accuracy": 0.7170910716056824,
"num_tokens": 95643923.0,
"step": 810
},
{
"epoch": 0.4535336672231497,
"grad_norm": 2.666908392602128,
"learning_rate": 1.3325703684482383e-05,
"loss": 1.0911,
"mean_token_accuracy": 0.7066366791725158,
"num_tokens": 96229319.0,
"step": 815
},
{
"epoch": 0.45631608235948806,
"grad_norm": 1.9412627867610472,
"learning_rate": 1.3233935088150154e-05,
"loss": 1.044,
"mean_token_accuracy": 0.7168261289596558,
"num_tokens": 96825493.0,
"step": 820
},
{
"epoch": 0.4590984974958264,
"grad_norm": 1.8919553931638313,
"learning_rate": 1.3141861317770628e-05,
"loss": 1.0856,
"mean_token_accuracy": 0.708315372467041,
"num_tokens": 97415636.0,
"step": 825
},
{
"epoch": 0.46188091263216474,
"grad_norm": 1.979892135995854,
"learning_rate": 1.3049491061992274e-05,
"loss": 1.0411,
"mean_token_accuracy": 0.716647469997406,
"num_tokens": 98008396.0,
"step": 830
},
{
"epoch": 0.46466332776850305,
"grad_norm": 1.941207079118909,
"learning_rate": 1.2956833037441756e-05,
"loss": 1.0489,
"mean_token_accuracy": 0.7146740078926086,
"num_tokens": 98593026.0,
"step": 835
},
{
"epoch": 0.4674457429048414,
"grad_norm": 2.184633389681683,
"learning_rate": 1.2863895987901364e-05,
"loss": 1.0746,
"mean_token_accuracy": 0.7111501693725586,
"num_tokens": 99185818.0,
"step": 840
},
{
"epoch": 0.47022815804117973,
"grad_norm": 1.9997670534792031,
"learning_rate": 1.2770688683483914e-05,
"loss": 1.0701,
"mean_token_accuracy": 0.708341383934021,
"num_tokens": 99774152.0,
"step": 845
},
{
"epoch": 0.4730105731775181,
"grad_norm": 2.0094775586736953,
"learning_rate": 1.2677219919805137e-05,
"loss": 1.0455,
"mean_token_accuracy": 0.7151992082595825,
"num_tokens": 100363649.0,
"step": 850
},
{
"epoch": 0.4757929883138564,
"grad_norm": 2.159247041588428,
"learning_rate": 1.2583498517153662e-05,
"loss": 1.0338,
"mean_token_accuracy": 0.7189494609832764,
"num_tokens": 100957067.0,
"step": 855
},
{
"epoch": 0.4785754034501948,
"grad_norm": 2.028288393918697,
"learning_rate": 1.2489533319658703e-05,
"loss": 1.0394,
"mean_token_accuracy": 0.7162809491157531,
"num_tokens": 101549170.0,
"step": 860
},
{
"epoch": 0.4813578185865331,
"grad_norm": 2.03306725822367,
"learning_rate": 1.2395333194455444e-05,
"loss": 1.0468,
"mean_token_accuracy": 0.7151136279106141,
"num_tokens": 102142380.0,
"step": 865
},
{
"epoch": 0.48414023372287146,
"grad_norm": 2.1682696530364374,
"learning_rate": 1.2300907030848307e-05,
"loss": 1.0554,
"mean_token_accuracy": 0.7153695344924926,
"num_tokens": 102734295.0,
"step": 870
},
{
"epoch": 0.4869226488592098,
"grad_norm": 2.019722631206529,
"learning_rate": 1.2206263739472085e-05,
"loss": 1.0397,
"mean_token_accuracy": 0.7160439848899841,
"num_tokens": 103319783.0,
"step": 875
},
{
"epoch": 0.48970506399554814,
"grad_norm": 1.9817132841610883,
"learning_rate": 1.2111412251451085e-05,
"loss": 1.0487,
"mean_token_accuracy": 0.7163015246391297,
"num_tokens": 103911953.0,
"step": 880
},
{
"epoch": 0.49248747913188645,
"grad_norm": 1.924001725795815,
"learning_rate": 1.2016361517556334e-05,
"loss": 1.0267,
"mean_token_accuracy": 0.7179745554924011,
"num_tokens": 104499490.0,
"step": 885
},
{
"epoch": 0.4952698942682248,
"grad_norm": 1.9061990735413328,
"learning_rate": 1.1921120507360934e-05,
"loss": 1.0194,
"mean_token_accuracy": 0.721126937866211,
"num_tokens": 105087086.0,
"step": 890
},
{
"epoch": 0.4980523094045632,
"grad_norm": 2.0609228249311635,
"learning_rate": 1.182569820839362e-05,
"loss": 1.0241,
"mean_token_accuracy": 0.7190962195396423,
"num_tokens": 105676890.0,
"step": 895
},
{
"epoch": 0.5008347245409015,
"grad_norm": 2.136985315971352,
"learning_rate": 1.1730103625290658e-05,
"loss": 1.0727,
"mean_token_accuracy": 0.7091086864471435,
"num_tokens": 106260405.0,
"step": 900
},
{
"epoch": 0.5036171396772399,
"grad_norm": 1.8617004459073545,
"learning_rate": 1.1634345778946112e-05,
"loss": 1.032,
"mean_token_accuracy": 0.7186322927474975,
"num_tokens": 106854042.0,
"step": 905
},
{
"epoch": 0.5063995548135782,
"grad_norm": 1.9034875739816928,
"learning_rate": 1.1538433705660561e-05,
"loss": 1.0323,
"mean_token_accuracy": 0.7186863660812378,
"num_tokens": 107444483.0,
"step": 910
},
{
"epoch": 0.5091819699499165,
"grad_norm": 2.0209462331889156,
"learning_rate": 1.1442376456288402e-05,
"loss": 1.0378,
"mean_token_accuracy": 0.7178295731544495,
"num_tokens": 108034521.0,
"step": 915
},
{
"epoch": 0.5119643850862549,
"grad_norm": 1.9075532608687007,
"learning_rate": 1.1346183095383731e-05,
"loss": 1.0475,
"mean_token_accuracy": 0.7155048370361328,
"num_tokens": 108621638.0,
"step": 920
},
{
"epoch": 0.5147468002225932,
"grad_norm": 2.1926421106061165,
"learning_rate": 1.1249862700344969e-05,
"loss": 1.0305,
"mean_token_accuracy": 0.7172364115715026,
"num_tokens": 109218688.0,
"step": 925
},
{
"epoch": 0.5175292153589316,
"grad_norm": 1.9946226985852828,
"learning_rate": 1.1153424360558268e-05,
"loss": 1.0339,
"mean_token_accuracy": 0.716673743724823,
"num_tokens": 109808908.0,
"step": 930
},
{
"epoch": 0.5203116304952699,
"grad_norm": 1.9329841963207612,
"learning_rate": 1.1056877176539767e-05,
"loss": 1.0291,
"mean_token_accuracy": 0.7183609366416931,
"num_tokens": 110396483.0,
"step": 935
},
{
"epoch": 0.5230940456316082,
"grad_norm": 2.013170356946493,
"learning_rate": 1.0960230259076819e-05,
"loss": 1.0456,
"mean_token_accuracy": 0.7170237064361572,
"num_tokens": 110985830.0,
"step": 940
},
{
"epoch": 0.5258764607679466,
"grad_norm": 1.8999726413056113,
"learning_rate": 1.086349272836824e-05,
"loss": 1.0313,
"mean_token_accuracy": 0.7180803418159485,
"num_tokens": 111581446.0,
"step": 945
},
{
"epoch": 0.528658875904285,
"grad_norm": 2.0409502249425335,
"learning_rate": 1.0766673713163667e-05,
"loss": 1.0261,
"mean_token_accuracy": 0.717565405368805,
"num_tokens": 112170288.0,
"step": 950
},
{
"epoch": 0.5314412910406232,
"grad_norm": 1.885899964182058,
"learning_rate": 1.0669782349902122e-05,
"loss": 1.0363,
"mean_token_accuracy": 0.7165609478950501,
"num_tokens": 112758548.0,
"step": 955
},
{
"epoch": 0.5342237061769616,
"grad_norm": 1.8421326453167242,
"learning_rate": 1.0572827781849835e-05,
"loss": 1.0248,
"mean_token_accuracy": 0.7190156698226928,
"num_tokens": 113351756.0,
"step": 960
},
{
"epoch": 0.5370061213132999,
"grad_norm": 1.9626347976920178,
"learning_rate": 1.0475819158237426e-05,
"loss": 1.0484,
"mean_token_accuracy": 0.716618275642395,
"num_tokens": 113929618.0,
"step": 965
},
{
"epoch": 0.5397885364496383,
"grad_norm": 1.9260027594244913,
"learning_rate": 1.0378765633396526e-05,
"loss": 1.0122,
"mean_token_accuracy": 0.7219829797744751,
"num_tokens": 114511171.0,
"step": 970
},
{
"epoch": 0.5425709515859767,
"grad_norm": 2.01788882274496,
"learning_rate": 1.0281676365895939e-05,
"loss": 1.0341,
"mean_token_accuracy": 0.7173137307167053,
"num_tokens": 115100329.0,
"step": 975
},
{
"epoch": 0.5453533667223149,
"grad_norm": 2.0538865520683554,
"learning_rate": 1.0184560517677353e-05,
"loss": 1.0588,
"mean_token_accuracy": 0.715462589263916,
"num_tokens": 115692616.0,
"step": 980
},
{
"epoch": 0.5481357818586533,
"grad_norm": 1.981183937899365,
"learning_rate": 1.0087427253190775e-05,
"loss": 1.0287,
"mean_token_accuracy": 0.7187099099159241,
"num_tokens": 116282977.0,
"step": 985
},
{
"epoch": 0.5509181969949917,
"grad_norm": 1.9187211997367468,
"learning_rate": 9.990285738529733e-06,
"loss": 1.0103,
"mean_token_accuracy": 0.7224372506141663,
"num_tokens": 116867356.0,
"step": 990
},
{
"epoch": 0.55370061213133,
"grad_norm": 1.876324102948862,
"learning_rate": 9.89314514056627e-06,
"loss": 0.9724,
"mean_token_accuracy": 0.7300009608268738,
"num_tokens": 117453088.0,
"step": 995
},
{
"epoch": 0.5564830272676683,
"grad_norm": 1.9623303147959466,
"learning_rate": 9.79601462608595e-06,
"loss": 1.0035,
"mean_token_accuracy": 0.7247561693191529,
"num_tokens": 118045362.0,
"step": 1000
},
{
"epoch": 0.5592654424040067,
"grad_norm": 1.8686077447526968,
"learning_rate": 9.698903360922773e-06,
"loss": 0.9856,
"mean_token_accuracy": 0.7283125519752502,
"num_tokens": 118637830.0,
"step": 1005
},
{
"epoch": 0.562047857540345,
"grad_norm": 1.8357018834213918,
"learning_rate": 9.601820509094272e-06,
"loss": 1.0289,
"mean_token_accuracy": 0.7204028606414795,
"num_tokens": 119229320.0,
"step": 1010
},
{
"epoch": 0.5648302726766834,
"grad_norm": 2.0538909601653286,
"learning_rate": 9.504775231936716e-06,
"loss": 1.0498,
"mean_token_accuracy": 0.7141813278198242,
"num_tokens": 119821047.0,
"step": 1015
},
{
"epoch": 0.5676126878130217,
"grad_norm": 1.836766812630116,
"learning_rate": 9.407776687240591e-06,
"loss": 0.9964,
"mean_token_accuracy": 0.7254474043846131,
"num_tokens": 120416538.0,
"step": 1020
},
{
"epoch": 0.57039510294936,
"grad_norm": 1.9514390647117241,
"learning_rate": 9.310834028386436e-06,
"loss": 1.0173,
"mean_token_accuracy": 0.7226798415184021,
"num_tokens": 121007161.0,
"step": 1025
},
{
"epoch": 0.5731775180856984,
"grad_norm": 1.9399928393210226,
"learning_rate": 9.213956403481037e-06,
"loss": 1.0142,
"mean_token_accuracy": 0.7212600111961365,
"num_tokens": 121598725.0,
"step": 1030
},
{
"epoch": 0.5759599332220368,
"grad_norm": 1.9888989034840572,
"learning_rate": 9.117152954494195e-06,
"loss": 1.0328,
"mean_token_accuracy": 0.7186324715614318,
"num_tokens": 122188007.0,
"step": 1035
},
{
"epoch": 0.5787423483583751,
"grad_norm": 1.9479652836008858,
"learning_rate": 9.020432816395993e-06,
"loss": 1.0293,
"mean_token_accuracy": 0.7192287445068359,
"num_tokens": 122775444.0,
"step": 1040
},
{
"epoch": 0.5815247634947134,
"grad_norm": 1.868873876658722,
"learning_rate": 8.92380511629481e-06,
"loss": 1.0088,
"mean_token_accuracy": 0.7221626877784729,
"num_tokens": 123364954.0,
"step": 1045
},
{
"epoch": 0.5843071786310517,
"grad_norm": 1.8460354821562948,
"learning_rate": 8.827278972575984e-06,
"loss": 1.0034,
"mean_token_accuracy": 0.7228006601333619,
"num_tokens": 123959095.0,
"step": 1050
},
{
"epoch": 0.5870895937673901,
"grad_norm": 2.027141329127932,
"learning_rate": 8.730863494041379e-06,
"loss": 1.0222,
"mean_token_accuracy": 0.7207066774368286,
"num_tokens": 124551375.0,
"step": 1055
},
{
"epoch": 0.5898720089037285,
"grad_norm": 1.9303209158495678,
"learning_rate": 8.634567779049807e-06,
"loss": 1.0136,
"mean_token_accuracy": 0.7226389169692993,
"num_tokens": 125148583.0,
"step": 1060
},
{
"epoch": 0.5926544240400667,
"grad_norm": 1.821089900718507,
"learning_rate": 8.538400914658456e-06,
"loss": 1.0157,
"mean_token_accuracy": 0.7223539471626281,
"num_tokens": 125739350.0,
"step": 1065
},
{
"epoch": 0.5954368391764051,
"grad_norm": 1.9402132874471187,
"learning_rate": 8.442371975765368e-06,
"loss": 1.0255,
"mean_token_accuracy": 0.7197723150253296,
"num_tokens": 126327360.0,
"step": 1070
},
{
"epoch": 0.5982192543127435,
"grad_norm": 4.186389331333341,
"learning_rate": 8.346490024253103e-06,
"loss": 0.9985,
"mean_token_accuracy": 0.7242988467216491,
"num_tokens": 126919726.0,
"step": 1075
},
{
"epoch": 0.6010016694490818,
"grad_norm": 1.8722288044640503,
"learning_rate": 8.250764108133562e-06,
"loss": 1.018,
"mean_token_accuracy": 0.720951783657074,
"num_tokens": 127503663.0,
"step": 1080
},
{
"epoch": 0.6037840845854201,
"grad_norm": 1.9837555239288165,
"learning_rate": 8.15520326069421e-06,
"loss": 1.0133,
"mean_token_accuracy": 0.7221065282821655,
"num_tokens": 128092295.0,
"step": 1085
},
{
"epoch": 0.6065664997217585,
"grad_norm": 1.918332101066207,
"learning_rate": 8.05981649964559e-06,
"loss": 1.0336,
"mean_token_accuracy": 0.7192610502243042,
"num_tokens": 128681634.0,
"step": 1090
},
{
"epoch": 0.6093489148580968,
"grad_norm": 1.9154193774615031,
"learning_rate": 7.964612826270399e-06,
"loss": 0.9984,
"mean_token_accuracy": 0.7253150701522827,
"num_tokens": 129276945.0,
"step": 1095
},
{
"epoch": 0.6121313299944352,
"grad_norm": 1.9833171612274754,
"learning_rate": 7.86960122457404e-06,
"loss": 1.0098,
"mean_token_accuracy": 0.7225422620773315,
"num_tokens": 129870803.0,
"step": 1100
},
{
"epoch": 0.6149137451307735,
"grad_norm": 1.8771784670033254,
"learning_rate": 7.774790660436857e-06,
"loss": 1.0001,
"mean_token_accuracy": 0.7242280602455139,
"num_tokens": 130461085.0,
"step": 1105
},
{
"epoch": 0.6176961602671118,
"grad_norm": 1.98853932669659,
"learning_rate": 7.680190080768046e-06,
"loss": 1.006,
"mean_token_accuracy": 0.7234596967697143,
"num_tokens": 131044253.0,
"step": 1110
},
{
"epoch": 0.6204785754034502,
"grad_norm": 2.4167589390741946,
"learning_rate": 7.585808412661379e-06,
"loss": 1.0199,
"mean_token_accuracy": 0.7218466520309448,
"num_tokens": 131634142.0,
"step": 1115
},
{
"epoch": 0.6232609905397886,
"grad_norm": 1.9611913130221166,
"learning_rate": 7.4916545625527745e-06,
"loss": 1.011,
"mean_token_accuracy": 0.7240106225013733,
"num_tokens": 132217401.0,
"step": 1120
},
{
"epoch": 0.6260434056761269,
"grad_norm": 1.8664547397127202,
"learning_rate": 7.397737415379853e-06,
"loss": 1.0042,
"mean_token_accuracy": 0.7248145937919617,
"num_tokens": 132804730.0,
"step": 1125
},
{
"epoch": 0.6288258208124652,
"grad_norm": 1.8854684297024977,
"learning_rate": 7.304065833743475e-06,
"loss": 1.0112,
"mean_token_accuracy": 0.7220677971839905,
"num_tokens": 133395672.0,
"step": 1130
},
{
"epoch": 0.6316082359488036,
"grad_norm": 1.986610522875934,
"learning_rate": 7.210648657071433e-06,
"loss": 1.0152,
"mean_token_accuracy": 0.7226180791854858,
"num_tokens": 133987957.0,
"step": 1135
},
{
"epoch": 0.6343906510851419,
"grad_norm": 1.889771442380467,
"learning_rate": 7.117494700784292e-06,
"loss": 0.9915,
"mean_token_accuracy": 0.7284766793251037,
"num_tokens": 134580628.0,
"step": 1140
},
{
"epoch": 0.6371730662214803,
"grad_norm": 1.795583208948344,
"learning_rate": 7.024612755463529e-06,
"loss": 1.0106,
"mean_token_accuracy": 0.7225217223167419,
"num_tokens": 135175991.0,
"step": 1145
},
{
"epoch": 0.6399554813578185,
"grad_norm": 1.9767456426052468,
"learning_rate": 6.9320115860219705e-06,
"loss": 1.005,
"mean_token_accuracy": 0.724352490901947,
"num_tokens": 135760748.0,
"step": 1150
},
{
"epoch": 0.6427378964941569,
"grad_norm": 1.9522889123008382,
"learning_rate": 6.839699930876727e-06,
"loss": 1.0128,
"mean_token_accuracy": 0.7235522747039795,
"num_tokens": 136348202.0,
"step": 1155
},
{
"epoch": 0.6455203116304953,
"grad_norm": 1.8485871557042115,
"learning_rate": 6.747686501124531e-06,
"loss": 1.0202,
"mean_token_accuracy": 0.7193972945213318,
"num_tokens": 136939858.0,
"step": 1160
},
{
"epoch": 0.6483027267668336,
"grad_norm": 1.784737267536378,
"learning_rate": 6.655979979719744e-06,
"loss": 0.9938,
"mean_token_accuracy": 0.7244254350662231,
"num_tokens": 137528803.0,
"step": 1165
},
{
"epoch": 0.6510851419031719,
"grad_norm": 1.8881790480821496,
"learning_rate": 6.5645890206549566e-06,
"loss": 0.974,
"mean_token_accuracy": 0.7322948575019836,
"num_tokens": 138114576.0,
"step": 1170
},
{
"epoch": 0.6538675570395103,
"grad_norm": 1.941045031850279,
"learning_rate": 6.473522248144359e-06,
"loss": 0.9798,
"mean_token_accuracy": 0.7303597807884217,
"num_tokens": 138701429.0,
"step": 1175
},
{
"epoch": 0.6566499721758486,
"grad_norm": 1.9164315660360134,
"learning_rate": 6.382788255809893e-06,
"loss": 1.0005,
"mean_token_accuracy": 0.7247542023658753,
"num_tokens": 139296441.0,
"step": 1180
},
{
"epoch": 0.659432387312187,
"grad_norm": 1.9252747772897012,
"learning_rate": 6.292395605870314e-06,
"loss": 0.9935,
"mean_token_accuracy": 0.727207088470459,
"num_tokens": 139884765.0,
"step": 1185
},
{
"epoch": 0.6622148024485254,
"grad_norm": 2.48708952288793,
"learning_rate": 6.202352828333211e-06,
"loss": 0.997,
"mean_token_accuracy": 0.7260267257690429,
"num_tokens": 140474793.0,
"step": 1190
},
{
"epoch": 0.6649972175848636,
"grad_norm": 1.8703391327630177,
"learning_rate": 6.112668420190042e-06,
"loss": 0.9826,
"mean_token_accuracy": 0.7283554911613465,
"num_tokens": 141064904.0,
"step": 1195
},
{
"epoch": 0.667779632721202,
"grad_norm": 1.9501052152673526,
"learning_rate": 6.023350844614344e-06,
"loss": 0.9763,
"mean_token_accuracy": 0.7310232162475586,
"num_tokens": 141649410.0,
"step": 1200
},
{
"epoch": 0.6705620478575404,
"grad_norm": 1.9782655835974774,
"learning_rate": 5.9344085301630425e-06,
"loss": 1.003,
"mean_token_accuracy": 0.723707640171051,
"num_tokens": 142239282.0,
"step": 1205
},
{
"epoch": 0.6733444629938787,
"grad_norm": 1.9354136592872768,
"learning_rate": 5.845849869981137e-06,
"loss": 1.0027,
"mean_token_accuracy": 0.7262308835983277,
"num_tokens": 142827280.0,
"step": 1210
},
{
"epoch": 0.676126878130217,
"grad_norm": 1.8791345693920576,
"learning_rate": 5.757683221009625e-06,
"loss": 0.9975,
"mean_token_accuracy": 0.7248466491699219,
"num_tokens": 143422429.0,
"step": 1215
},
{
"epoch": 0.6789092932665554,
"grad_norm": 1.8963640356605256,
"learning_rate": 5.669916903196931e-06,
"loss": 1.0014,
"mean_token_accuracy": 0.7251011848449707,
"num_tokens": 144009015.0,
"step": 1220
},
{
"epoch": 0.6816917084028937,
"grad_norm": 1.7618216747620736,
"learning_rate": 5.58255919871374e-06,
"loss": 0.9848,
"mean_token_accuracy": 0.7293275952339172,
"num_tokens": 144602634.0,
"step": 1225
},
{
"epoch": 0.6844741235392321,
"grad_norm": 1.8457721051471847,
"learning_rate": 5.495618351171484e-06,
"loss": 0.9919,
"mean_token_accuracy": 0.7272073984146118,
"num_tokens": 145196052.0,
"step": 1230
},
{
"epoch": 0.6872565386755703,
"grad_norm": 1.9450479369664486,
"learning_rate": 5.409102564844393e-06,
"loss": 0.9938,
"mean_token_accuracy": 0.7261118292808533,
"num_tokens": 145794135.0,
"step": 1235
},
{
"epoch": 0.6900389538119087,
"grad_norm": 1.8511319642398822,
"learning_rate": 5.323020003895307e-06,
"loss": 0.9484,
"mean_token_accuracy": 0.7359282970428467,
"num_tokens": 146384348.0,
"step": 1240
},
{
"epoch": 0.6928213689482471,
"grad_norm": 1.7695297382973636,
"learning_rate": 5.237378791605249e-06,
"loss": 0.9638,
"mean_token_accuracy": 0.7326830267906189,
"num_tokens": 146981119.0,
"step": 1245
},
{
"epoch": 0.6956037840845855,
"grad_norm": 1.9109931194680208,
"learning_rate": 5.152187009606864e-06,
"loss": 0.9878,
"mean_token_accuracy": 0.7266369104385376,
"num_tokens": 147573298.0,
"step": 1250
},
{
"epoch": 0.6983861992209237,
"grad_norm": 1.8838124813014612,
"learning_rate": 5.067452697121773e-06,
"loss": 1.0136,
"mean_token_accuracy": 0.7227142214775085,
"num_tokens": 148166017.0,
"step": 1255
},
{
"epoch": 0.7011686143572621,
"grad_norm": 1.8764858816220478,
"learning_rate": 4.98318385020197e-06,
"loss": 0.991,
"mean_token_accuracy": 0.7256438136100769,
"num_tokens": 148758203.0,
"step": 1260
},
{
"epoch": 0.7039510294936004,
"grad_norm": 1.7836760728163532,
"learning_rate": 4.8993884209752364e-06,
"loss": 0.9776,
"mean_token_accuracy": 0.728616988658905,
"num_tokens": 149343772.0,
"step": 1265
},
{
"epoch": 0.7067334446299388,
"grad_norm": 1.859071558512736,
"learning_rate": 4.81607431689475e-06,
"loss": 0.9859,
"mean_token_accuracy": 0.7275610089302063,
"num_tokens": 149934534.0,
"step": 1270
},
{
"epoch": 0.7095158597662772,
"grad_norm": 1.830584410956007,
"learning_rate": 4.7332493999928785e-06,
"loss": 0.9997,
"mean_token_accuracy": 0.7258034944534302,
"num_tokens": 150528868.0,
"step": 1275
},
{
"epoch": 0.7122982749026154,
"grad_norm": 1.7872234444186852,
"learning_rate": 4.6509214861392785e-06,
"loss": 0.9904,
"mean_token_accuracy": 0.7283051371574402,
"num_tokens": 151128370.0,
"step": 1280
},
{
"epoch": 0.7150806900389538,
"grad_norm": 1.8707290023032752,
"learning_rate": 4.569098344303319e-06,
"loss": 0.9715,
"mean_token_accuracy": 0.7312512874603272,
"num_tokens": 151722014.0,
"step": 1285
},
{
"epoch": 0.7178631051752922,
"grad_norm": 1.847939158663493,
"learning_rate": 4.487787695820991e-06,
"loss": 0.973,
"mean_token_accuracy": 0.7308701038360595,
"num_tokens": 152312667.0,
"step": 1290
},
{
"epoch": 0.7206455203116305,
"grad_norm": 1.7516029476618973,
"learning_rate": 4.406997213666236e-06,
"loss": 0.9661,
"mean_token_accuracy": 0.731387734413147,
"num_tokens": 152899175.0,
"step": 1295
},
{
"epoch": 0.7234279354479688,
"grad_norm": 1.8151365859715758,
"learning_rate": 4.326734521726905e-06,
"loss": 0.9563,
"mean_token_accuracy": 0.7346587657928467,
"num_tokens": 153488259.0,
"step": 1300
},
{
"epoch": 0.7262103505843072,
"grad_norm": 1.7688011856056145,
"learning_rate": 4.24700719408531e-06,
"loss": 0.975,
"mean_token_accuracy": 0.7301976919174195,
"num_tokens": 154076796.0,
"step": 1305
},
{
"epoch": 0.7289927657206455,
"grad_norm": 1.9580888383075816,
"learning_rate": 4.167822754303493e-06,
"loss": 0.9738,
"mean_token_accuracy": 0.7310252785682678,
"num_tokens": 154664728.0,
"step": 1310
},
{
"epoch": 0.7317751808569839,
"grad_norm": 1.999126181586082,
"learning_rate": 4.0891886747132356e-06,
"loss": 0.9824,
"mean_token_accuracy": 0.7299495816230774,
"num_tokens": 155254919.0,
"step": 1315
},
{
"epoch": 0.7345575959933222,
"grad_norm": 1.921167819518032,
"learning_rate": 4.011112375710958e-06,
"loss": 1.0045,
"mean_token_accuracy": 0.7263089060783386,
"num_tokens": 155842686.0,
"step": 1320
},
{
"epoch": 0.7373400111296605,
"grad_norm": 1.801354649641667,
"learning_rate": 3.933601225057446e-06,
"loss": 0.9541,
"mean_token_accuracy": 0.7346353769302368,
"num_tokens": 156428772.0,
"step": 1325
},
{
"epoch": 0.7401224262659989,
"grad_norm": 1.8122280326962623,
"learning_rate": 3.85666253718263e-06,
"loss": 0.9565,
"mean_token_accuracy": 0.7328558325767517,
"num_tokens": 157015422.0,
"step": 1330
},
{
"epoch": 0.7429048414023373,
"grad_norm": 1.839585045567707,
"learning_rate": 3.7803035724953007e-06,
"loss": 0.9652,
"mean_token_accuracy": 0.7333778142929077,
"num_tokens": 157603116.0,
"step": 1335
},
{
"epoch": 0.7456872565386756,
"grad_norm": 1.8056794531482636,
"learning_rate": 3.704531536698012e-06,
"loss": 0.9576,
"mean_token_accuracy": 0.7345310568809509,
"num_tokens": 158186881.0,
"step": 1340
},
{
"epoch": 0.7484696716750139,
"grad_norm": 1.9143591141126923,
"learning_rate": 3.6293535801070735e-06,
"loss": 0.9709,
"mean_token_accuracy": 0.7322964310646057,
"num_tokens": 158782774.0,
"step": 1345
},
{
"epoch": 0.7512520868113522,
"grad_norm": 1.777185833225788,
"learning_rate": 3.5547767969778355e-06,
"loss": 0.9892,
"mean_token_accuracy": 0.7279403567314148,
"num_tokens": 159372802.0,
"step": 1350
},
{
"epoch": 0.7540345019476906,
"grad_norm": 1.9535914643310681,
"learning_rate": 3.4808082248352058e-06,
"loss": 0.9802,
"mean_token_accuracy": 0.7304705739021301,
"num_tokens": 159960156.0,
"step": 1355
},
{
"epoch": 0.756816917084029,
"grad_norm": 1.8788792436096216,
"learning_rate": 3.40745484380956e-06,
"loss": 0.9821,
"mean_token_accuracy": 0.7290140271186829,
"num_tokens": 160545408.0,
"step": 1360
},
{
"epoch": 0.7595993322203672,
"grad_norm": 1.927958774997016,
"learning_rate": 3.3347235759780483e-06,
"loss": 0.9752,
"mean_token_accuracy": 0.731472396850586,
"num_tokens": 161134387.0,
"step": 1365
},
{
"epoch": 0.7623817473567056,
"grad_norm": 1.916500311041559,
"learning_rate": 3.262621284711376e-06,
"loss": 0.9846,
"mean_token_accuracy": 0.729660439491272,
"num_tokens": 161724072.0,
"step": 1370
},
{
"epoch": 0.765164162493044,
"grad_norm": 1.7923444423002466,
"learning_rate": 3.191154774026156e-06,
"loss": 0.9655,
"mean_token_accuracy": 0.7318884611129761,
"num_tokens": 162310073.0,
"step": 1375
},
{
"epoch": 0.7679465776293823,
"grad_norm": 1.7760349668951476,
"learning_rate": 3.1203307879428146e-06,
"loss": 0.9522,
"mean_token_accuracy": 0.7352138042449952,
"num_tokens": 162900817.0,
"step": 1380
},
{
"epoch": 0.7707289927657206,
"grad_norm": 1.9573185565221094,
"learning_rate": 3.0501560098492056e-06,
"loss": 0.9476,
"mean_token_accuracy": 0.7361976623535156,
"num_tokens": 163488071.0,
"step": 1385
},
{
"epoch": 0.773511407902059,
"grad_norm": 1.8472978388798753,
"learning_rate": 2.9806370618699142e-06,
"loss": 0.9599,
"mean_token_accuracy": 0.7325201988220215,
"num_tokens": 164076411.0,
"step": 1390
},
{
"epoch": 0.7762938230383973,
"grad_norm": 1.9039958120991338,
"learning_rate": 2.911780504241354e-06,
"loss": 0.955,
"mean_token_accuracy": 0.7342100620269776,
"num_tokens": 164665515.0,
"step": 1395
},
{
"epoch": 0.7790762381747357,
"grad_norm": 1.8467129585219402,
"learning_rate": 2.8435928346926945e-06,
"loss": 0.959,
"mean_token_accuracy": 0.7346114397048951,
"num_tokens": 165255953.0,
"step": 1400
},
{
"epoch": 0.781858653311074,
"grad_norm": 2.0658334017918474,
"learning_rate": 2.776080487832715e-06,
"loss": 0.961,
"mean_token_accuracy": 0.7332920074462891,
"num_tokens": 165838622.0,
"step": 1405
},
{
"epoch": 0.7846410684474123,
"grad_norm": 1.8352945141096326,
"learning_rate": 2.70924983454257e-06,
"loss": 0.9963,
"mean_token_accuracy": 0.7269340515136719,
"num_tokens": 166431603.0,
"step": 1410
},
{
"epoch": 0.7874234835837507,
"grad_norm": 1.7741661715418184,
"learning_rate": 2.6431071813746277e-06,
"loss": 0.9548,
"mean_token_accuracy": 0.7333246469497681,
"num_tokens": 167020566.0,
"step": 1415
},
{
"epoch": 0.7902058987200891,
"grad_norm": 1.7577119163466035,
"learning_rate": 2.5776587699573007e-06,
"loss": 0.9557,
"mean_token_accuracy": 0.7359763622283936,
"num_tokens": 167611779.0,
"step": 1420
},
{
"epoch": 0.7929883138564274,
"grad_norm": 1.8883403863945059,
"learning_rate": 2.512910776406089e-06,
"loss": 0.9714,
"mean_token_accuracy": 0.7312511920928955,
"num_tokens": 168191251.0,
"step": 1425
},
{
"epoch": 0.7957707289927657,
"grad_norm": 1.8710738581171973,
"learning_rate": 2.4488693107407335e-06,
"loss": 0.9731,
"mean_token_accuracy": 0.7300806879997254,
"num_tokens": 168782303.0,
"step": 1430
},
{
"epoch": 0.798553144129104,
"grad_norm": 1.810175185890339,
"learning_rate": 2.3855404163086558e-06,
"loss": 0.9595,
"mean_token_accuracy": 0.7339372992515564,
"num_tokens": 169372790.0,
"step": 1435
},
{
"epoch": 0.8013355592654424,
"grad_norm": 1.8716987813342199,
"learning_rate": 2.322930069214664e-06,
"loss": 0.9422,
"mean_token_accuracy": 0.7376594424247742,
"num_tokens": 169958372.0,
"step": 1440
},
{
"epoch": 0.8041179744017808,
"grad_norm": 1.8055779362732807,
"learning_rate": 2.2610441777570104e-06,
"loss": 0.9713,
"mean_token_accuracy": 0.7313427925109863,
"num_tokens": 170547568.0,
"step": 1445
},
{
"epoch": 0.806900389538119,
"grad_norm": 1.8191938807882237,
"learning_rate": 2.1998885818698434e-06,
"loss": 0.9395,
"mean_token_accuracy": 0.7381924271583558,
"num_tokens": 171132579.0,
"step": 1450
},
{
"epoch": 0.8096828046744574,
"grad_norm": 1.889036188183445,
"learning_rate": 2.1394690525721275e-06,
"loss": 0.9744,
"mean_token_accuracy": 0.7313727378845215,
"num_tokens": 171722385.0,
"step": 1455
},
{
"epoch": 0.8124652198107958,
"grad_norm": 1.8701292804037986,
"learning_rate": 2.079791291423039e-06,
"loss": 0.9786,
"mean_token_accuracy": 0.729002046585083,
"num_tokens": 172315922.0,
"step": 1460
},
{
"epoch": 0.8152476349471341,
"grad_norm": 1.8760160382405797,
"learning_rate": 2.0208609299839465e-06,
"loss": 0.9683,
"mean_token_accuracy": 0.7306602478027344,
"num_tokens": 172910228.0,
"step": 1465
},
{
"epoch": 0.8180300500834724,
"grad_norm": 2.129314039319898,
"learning_rate": 1.962683529286973e-06,
"loss": 0.9634,
"mean_token_accuracy": 0.7342103958129883,
"num_tokens": 173492796.0,
"step": 1470
},
{
"epoch": 0.8208124652198108,
"grad_norm": 1.786710036467001,
"learning_rate": 1.9052645793102277e-06,
"loss": 0.9646,
"mean_token_accuracy": 0.7335922002792359,
"num_tokens": 174076770.0,
"step": 1475
},
{
"epoch": 0.8235948803561491,
"grad_norm": 1.895632789397937,
"learning_rate": 1.8486094984597268e-06,
"loss": 1.0103,
"mean_token_accuracy": 0.723214328289032,
"num_tokens": 174666564.0,
"step": 1480
},
{
"epoch": 0.8263772954924875,
"grad_norm": 1.8125039443565851,
"learning_rate": 1.7927236330581e-06,
"loss": 0.9504,
"mean_token_accuracy": 0.7362190008163452,
"num_tokens": 175250981.0,
"step": 1485
},
{
"epoch": 0.8291597106288259,
"grad_norm": 1.722786992549355,
"learning_rate": 1.7376122568400533e-06,
"loss": 0.9499,
"mean_token_accuracy": 0.7359644174575806,
"num_tokens": 175846510.0,
"step": 1490
},
{
"epoch": 0.8319421257651641,
"grad_norm": 1.841603530171618,
"learning_rate": 1.6832805704547272e-06,
"loss": 0.9551,
"mean_token_accuracy": 0.7342963933944702,
"num_tokens": 176432352.0,
"step": 1495
},
{
"epoch": 0.8347245409015025,
"grad_norm": 1.9002675861540066,
"learning_rate": 1.6297337009749249e-06,
"loss": 0.9446,
"mean_token_accuracy": 0.7374125838279724,
"num_tokens": 177024825.0,
"step": 1500
},
{
"epoch": 0.8375069560378409,
"grad_norm": 1.809765975130373,
"learning_rate": 1.5769767014132885e-06,
"loss": 0.9544,
"mean_token_accuracy": 0.7355196237564087,
"num_tokens": 177612725.0,
"step": 1505
},
{
"epoch": 0.8402893711741792,
"grad_norm": 1.75809727968389,
"learning_rate": 1.5250145502454594e-06,
"loss": 0.9548,
"mean_token_accuracy": 0.7356468796730041,
"num_tokens": 178207999.0,
"step": 1510
},
{
"epoch": 0.8430717863105175,
"grad_norm": 1.9477117802608452,
"learning_rate": 1.473852150940297e-06,
"loss": 0.9501,
"mean_token_accuracy": 0.7353867173194886,
"num_tokens": 178792546.0,
"step": 1515
},
{
"epoch": 0.8458542014468559,
"grad_norm": 1.8675907748201204,
"learning_rate": 1.4234943314971328e-06,
"loss": 0.9472,
"mean_token_accuracy": 0.7378309011459351,
"num_tokens": 179380874.0,
"step": 1520
},
{
"epoch": 0.8486366165831942,
"grad_norm": 1.9839427192393002,
"learning_rate": 1.373945843990192e-06,
"loss": 0.9686,
"mean_token_accuracy": 0.7325679302215576,
"num_tokens": 179970205.0,
"step": 1525
},
{
"epoch": 0.8514190317195326,
"grad_norm": 1.8714168212459494,
"learning_rate": 1.3252113641201537e-06,
"loss": 0.9532,
"mean_token_accuracy": 0.7361051917076111,
"num_tokens": 180566757.0,
"step": 1530
},
{
"epoch": 0.8542014468558708,
"grad_norm": 2.088368062803997,
"learning_rate": 1.2772954907729074e-06,
"loss": 0.9185,
"mean_token_accuracy": 0.7416197896003723,
"num_tokens": 181156035.0,
"step": 1535
},
{
"epoch": 0.8569838619922092,
"grad_norm": 1.9320065667167445,
"learning_rate": 1.2302027455855969e-06,
"loss": 0.9452,
"mean_token_accuracy": 0.736557149887085,
"num_tokens": 181740790.0,
"step": 1540
},
{
"epoch": 0.8597662771285476,
"grad_norm": 1.8430967747183953,
"learning_rate": 1.1839375725199098e-06,
"loss": 0.9541,
"mean_token_accuracy": 0.7358271360397339,
"num_tokens": 182328713.0,
"step": 1545
},
{
"epoch": 0.862548692264886,
"grad_norm": 1.9004590415789702,
"learning_rate": 1.1385043374427341e-06,
"loss": 0.9663,
"mean_token_accuracy": 0.731933867931366,
"num_tokens": 182919104.0,
"step": 1550
},
{
"epoch": 0.8653311074012242,
"grad_norm": 1.738836286774547,
"learning_rate": 1.0939073277141598e-06,
"loss": 0.9462,
"mean_token_accuracy": 0.737112843990326,
"num_tokens": 183507931.0,
"step": 1555
},
{
"epoch": 0.8681135225375626,
"grad_norm": 1.929598189537243,
"learning_rate": 1.0501507517829012e-06,
"loss": 0.9662,
"mean_token_accuracy": 0.7332514524459839,
"num_tokens": 184093155.0,
"step": 1560
},
{
"epoch": 0.8708959376739009,
"grad_norm": 1.8032404204598453,
"learning_rate": 1.0072387387891535e-06,
"loss": 0.941,
"mean_token_accuracy": 0.7367923140525818,
"num_tokens": 184680915.0,
"step": 1565
},
{
"epoch": 0.8736783528102393,
"grad_norm": 1.8432734129518489,
"learning_rate": 9.65175338174954e-07,
"loss": 0.9615,
"mean_token_accuracy": 0.7350999474525451,
"num_tokens": 185272303.0,
"step": 1570
},
{
"epoch": 0.8764607679465777,
"grad_norm": 1.9082038654252018,
"learning_rate": 9.239645193020386e-07,
"loss": 0.969,
"mean_token_accuracy": 0.7324134349822998,
"num_tokens": 185865698.0,
"step": 1575
},
{
"epoch": 0.8792431830829159,
"grad_norm": 1.8191308709884229,
"learning_rate": 8.836101710772826e-07,
"loss": 0.9429,
"mean_token_accuracy": 0.7369024634361268,
"num_tokens": 186455776.0,
"step": 1580
},
{
"epoch": 0.8820255982192543,
"grad_norm": 1.8602027757002002,
"learning_rate": 8.441161015857092e-07,
"loss": 0.9621,
"mean_token_accuracy": 0.7330835700035095,
"num_tokens": 187049436.0,
"step": 1585
},
{
"epoch": 0.8848080133555927,
"grad_norm": 1.8629150091039206,
"learning_rate": 8.054860377311368e-07,
"loss": 0.9632,
"mean_token_accuracy": 0.7352335929870606,
"num_tokens": 187643221.0,
"step": 1590
},
{
"epoch": 0.887590428491931,
"grad_norm": 1.7199187404593725,
"learning_rate": 7.677236248844855e-07,
"loss": 0.9208,
"mean_token_accuracy": 0.7412317156791687,
"num_tokens": 188241695.0,
"step": 1595
},
{
"epoch": 0.8903728436282693,
"grad_norm": 1.9691877660185322,
"learning_rate": 7.308324265397837e-07,
"loss": 0.9454,
"mean_token_accuracy": 0.7370688557624817,
"num_tokens": 188836037.0,
"step": 1600
},
{
"epoch": 0.8931552587646077,
"grad_norm": 1.9408377445794653,
"learning_rate": 6.948159239778829e-07,
"loss": 0.9529,
"mean_token_accuracy": 0.7338770508766175,
"num_tokens": 189432028.0,
"step": 1605
},
{
"epoch": 0.895937673900946,
"grad_norm": 1.7165506466090106,
"learning_rate": 6.596775159379543e-07,
"loss": 0.9539,
"mean_token_accuracy": 0.7329376816749573,
"num_tokens": 190027436.0,
"step": 1610
},
{
"epoch": 0.8987200890372844,
"grad_norm": 1.8489886082758764,
"learning_rate": 6.254205182967566e-07,
"loss": 0.9827,
"mean_token_accuracy": 0.7286684274673462,
"num_tokens": 190619847.0,
"step": 1615
},
{
"epoch": 0.9015025041736227,
"grad_norm": 1.8671715601161172,
"learning_rate": 5.920481637557318e-07,
"loss": 0.9519,
"mean_token_accuracy": 0.7349419355392456,
"num_tokens": 191212681.0,
"step": 1620
},
{
"epoch": 0.904284919309961,
"grad_norm": 1.9345594767714265,
"learning_rate": 5.59563601535943e-07,
"loss": 0.9273,
"mean_token_accuracy": 0.7412004351615906,
"num_tokens": 191805794.0,
"step": 1625
},
{
"epoch": 0.9070673344462994,
"grad_norm": 1.8551965637564125,
"learning_rate": 5.279698970809011e-07,
"loss": 0.9414,
"mean_token_accuracy": 0.7379236817359924,
"num_tokens": 192402500.0,
"step": 1630
},
{
"epoch": 0.9098497495826378,
"grad_norm": 1.7354354682366766,
"learning_rate": 4.972700317672829e-07,
"loss": 0.9497,
"mean_token_accuracy": 0.7355363965034485,
"num_tokens": 192989888.0,
"step": 1635
},
{
"epoch": 0.9126321647189761,
"grad_norm": 1.853770597272773,
"learning_rate": 4.674669026236045e-07,
"loss": 0.9457,
"mean_token_accuracy": 0.7373492479324341,
"num_tokens": 193579779.0,
"step": 1640
},
{
"epoch": 0.9154145798553144,
"grad_norm": 1.8911816701117188,
"learning_rate": 4.385633220568186e-07,
"loss": 0.9575,
"mean_token_accuracy": 0.7329700469970704,
"num_tokens": 194172425.0,
"step": 1645
},
{
"epoch": 0.9181969949916527,
"grad_norm": 1.810081275671363,
"learning_rate": 4.1056201758693957e-07,
"loss": 0.9497,
"mean_token_accuracy": 0.7358971953392028,
"num_tokens": 194760384.0,
"step": 1650
},
{
"epoch": 0.9209794101279911,
"grad_norm": 1.8068796359631913,
"learning_rate": 3.834656315896379e-07,
"loss": 0.9349,
"mean_token_accuracy": 0.7383142828941345,
"num_tokens": 195354279.0,
"step": 1655
},
{
"epoch": 0.9237618252643295,
"grad_norm": 1.7877479581883122,
"learning_rate": 3.572767210469086e-07,
"loss": 0.9418,
"mean_token_accuracy": 0.7375067234039306,
"num_tokens": 195940416.0,
"step": 1660
},
{
"epoch": 0.9265442404006677,
"grad_norm": 1.8061020929645715,
"learning_rate": 3.319977573057642e-07,
"loss": 0.9361,
"mean_token_accuracy": 0.7379802227020263,
"num_tokens": 196528933.0,
"step": 1665
},
{
"epoch": 0.9293266555370061,
"grad_norm": 1.7940687494892404,
"learning_rate": 3.0763112584503264e-07,
"loss": 0.9441,
"mean_token_accuracy": 0.7372017502784729,
"num_tokens": 197115460.0,
"step": 1670
},
{
"epoch": 0.9321090706733445,
"grad_norm": 1.7726872041826656,
"learning_rate": 2.841791260502402e-07,
"loss": 0.9618,
"mean_token_accuracy": 0.7329637885093689,
"num_tokens": 197705685.0,
"step": 1675
},
{
"epoch": 0.9348914858096828,
"grad_norm": 2.1180868450006214,
"learning_rate": 2.6164397099663676e-07,
"loss": 0.9656,
"mean_token_accuracy": 0.7338227391242981,
"num_tokens": 198291195.0,
"step": 1680
},
{
"epoch": 0.9376739009460211,
"grad_norm": 1.875515474041982,
"learning_rate": 2.4002778724034447e-07,
"loss": 0.9543,
"mean_token_accuracy": 0.7351009726524353,
"num_tokens": 198877612.0,
"step": 1685
},
{
"epoch": 0.9404563160823595,
"grad_norm": 1.8136225504642058,
"learning_rate": 2.1933261461769772e-07,
"loss": 0.9181,
"mean_token_accuracy": 0.7414175033569336,
"num_tokens": 199453234.0,
"step": 1690
},
{
"epoch": 0.9432387312186978,
"grad_norm": 1.7835014898755626,
"learning_rate": 1.9956040605273784e-07,
"loss": 0.9749,
"mean_token_accuracy": 0.730805778503418,
"num_tokens": 200046406.0,
"step": 1695
},
{
"epoch": 0.9460211463550362,
"grad_norm": 1.7517370910691499,
"learning_rate": 1.8071302737293294e-07,
"loss": 0.9323,
"mean_token_accuracy": 0.7374993085861206,
"num_tokens": 200635906.0,
"step": 1700
},
{
"epoch": 0.9488035614913745,
"grad_norm": 1.8527935092393206,
"learning_rate": 1.6279225713310088e-07,
"loss": 0.9295,
"mean_token_accuracy": 0.7385197877883911,
"num_tokens": 201229877.0,
"step": 1705
},
{
"epoch": 0.9515859766277128,
"grad_norm": 1.8136557164407798,
"learning_rate": 1.4579978644757463e-07,
"loss": 0.9471,
"mean_token_accuracy": 0.7368571162223816,
"num_tokens": 201819996.0,
"step": 1710
},
{
"epoch": 0.9543683917640512,
"grad_norm": 1.9202219251983068,
"learning_rate": 1.297372188306234e-07,
"loss": 0.9695,
"mean_token_accuracy": 0.7313903212547302,
"num_tokens": 202413015.0,
"step": 1715
},
{
"epoch": 0.9571508069003896,
"grad_norm": 1.9683140510809376,
"learning_rate": 1.1460607004512681e-07,
"loss": 0.9575,
"mean_token_accuracy": 0.734274709224701,
"num_tokens": 203004551.0,
"step": 1720
},
{
"epoch": 0.9599332220367279,
"grad_norm": 1.9014384654161847,
"learning_rate": 1.004077679595472e-07,
"loss": 0.9535,
"mean_token_accuracy": 0.7355141043663025,
"num_tokens": 203598341.0,
"step": 1725
},
{
"epoch": 0.9627156371730662,
"grad_norm": 1.9286046696281232,
"learning_rate": 8.714365241318079e-08,
"loss": 0.9554,
"mean_token_accuracy": 0.7344950199127197,
"num_tokens": 204184977.0,
"step": 1730
},
{
"epoch": 0.9654980523094046,
"grad_norm": 1.848637459602703,
"learning_rate": 7.481497508972313e-08,
"loss": 0.9495,
"mean_token_accuracy": 0.7371500611305237,
"num_tokens": 204771217.0,
"step": 1735
},
{
"epoch": 0.9682804674457429,
"grad_norm": 1.8523905866159716,
"learning_rate": 6.342289939915369e-08,
"loss": 0.9586,
"mean_token_accuracy": 0.7342531204223632,
"num_tokens": 205355942.0,
"step": 1740
},
{
"epoch": 0.9710628825820813,
"grad_norm": 1.7373507952790055,
"learning_rate": 5.2968500367951425e-08,
"loss": 0.9239,
"mean_token_accuracy": 0.7412109613418579,
"num_tokens": 205948315.0,
"step": 1745
},
{
"epoch": 0.9738452977184195,
"grad_norm": 1.7596019981079423,
"learning_rate": 4.345276453764258e-08,
"loss": 0.9212,
"mean_token_accuracy": 0.7406978607177734,
"num_tokens": 206543533.0,
"step": 1750
},
{
"epoch": 0.9766277128547579,
"grad_norm": 1.9410970916478685,
"learning_rate": 3.487658987171294e-08,
"loss": 0.9673,
"mean_token_accuracy": 0.732921814918518,
"num_tokens": 207131317.0,
"step": 1755
},
{
"epoch": 0.9794101279910963,
"grad_norm": 1.8954966093705055,
"learning_rate": 2.724078567086119e-08,
"loss": 0.961,
"mean_token_accuracy": 0.733088493347168,
"num_tokens": 207715056.0,
"step": 1760
},
{
"epoch": 0.9821925431274346,
"grad_norm": 1.9018642175991762,
"learning_rate": 2.054607249663665e-08,
"loss": 0.9794,
"mean_token_accuracy": 0.7320362687110901,
"num_tokens": 208303816.0,
"step": 1765
},
{
"epoch": 0.9849749582637729,
"grad_norm": 1.7650786536070042,
"learning_rate": 1.4793082103435885e-08,
"loss": 0.9314,
"mean_token_accuracy": 0.739205515384674,
"num_tokens": 208893950.0,
"step": 1770
},
{
"epoch": 0.9877573734001113,
"grad_norm": 1.7266587239672113,
"learning_rate": 9.982357378891528e-09,
"loss": 0.9607,
"mean_token_accuracy": 0.7332687497138977,
"num_tokens": 209482604.0,
"step": 1775
},
{
"epoch": 0.9905397885364496,
"grad_norm": 1.8691016007630499,
"learning_rate": 6.114352292639902e-09,
"loss": 0.9559,
"mean_token_accuracy": 0.734725546836853,
"num_tokens": 210067242.0,
"step": 1780
},
{
"epoch": 0.993322203672788,
"grad_norm": 1.6438880148449324,
"learning_rate": 3.1894318534819725e-09,
"loss": 0.9707,
"mean_token_accuracy": 0.7310842633247375,
"num_tokens": 210660656.0,
"step": 1785
},
{
"epoch": 0.9961046188091264,
"grad_norm": 1.9330784206590557,
"learning_rate": 1.2078720749364447e-09,
"loss": 0.9491,
"mean_token_accuracy": 0.7360571622848511,
"num_tokens": 211243204.0,
"step": 1790
},
{
"epoch": 0.9988870339454646,
"grad_norm": 1.8314428771696998,
"learning_rate": 1.69859949198381e-10,
"loss": 0.9316,
"mean_token_accuracy": 0.7387963652610778,
"num_tokens": 211840329.0,
"step": 1795
},
{
"epoch": 1.0,
"eval_loss": 0.9461386799812317,
"eval_mean_token_accuracy": 0.7361341528594494,
"eval_num_tokens": 212075689.0,
"eval_runtime": 4.8216,
"eval_samples_per_second": 207.398,
"eval_steps_per_second": 3.318,
"step": 1797
},
{
"epoch": 1.0,
"step": 1797,
"total_flos": 376255241256960.0,
"train_loss": 1.0248491220097444,
"train_runtime": 4390.0943,
"train_samples_per_second": 52.385,
"train_steps_per_second": 0.409
}
],
"logging_steps": 5,
"max_steps": 1797,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 376255241256960.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}