{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1797, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0027824151363383415, "grad_norm": 11.703799339759724, "learning_rate": 4.444444444444445e-07, "loss": 1.1659, "mean_token_accuracy": 0.6976747632026672, "num_tokens": 585246.0, "step": 5 }, { "epoch": 0.005564830272676683, "grad_norm": 5.602071651449005, "learning_rate": 1.0000000000000002e-06, "loss": 1.0864, "mean_token_accuracy": 0.7083896398544312, "num_tokens": 1171311.0, "step": 10 }, { "epoch": 0.008347245409015025, "grad_norm": 3.1420241423044755, "learning_rate": 1.5555555555555558e-06, "loss": 1.0324, "mean_token_accuracy": 0.7197834849357605, "num_tokens": 1757884.0, "step": 15 }, { "epoch": 0.011129660545353366, "grad_norm": 2.934538436088403, "learning_rate": 2.1111111111111114e-06, "loss": 1.0212, "mean_token_accuracy": 0.7205601453781127, "num_tokens": 2351106.0, "step": 20 }, { "epoch": 0.013912075681691708, "grad_norm": 2.7899043139645996, "learning_rate": 2.666666666666667e-06, "loss": 0.9897, "mean_token_accuracy": 0.7282361865043641, "num_tokens": 2942453.0, "step": 25 }, { "epoch": 0.01669449081803005, "grad_norm": 2.618414182725373, "learning_rate": 3.2222222222222227e-06, "loss": 0.9967, "mean_token_accuracy": 0.7268516778945923, "num_tokens": 3524350.0, "step": 30 }, { "epoch": 0.019476905954368393, "grad_norm": 3.312970663728368, "learning_rate": 3.777777777777778e-06, "loss": 1.0133, "mean_token_accuracy": 0.7244957327842713, "num_tokens": 4116171.0, "step": 35 }, { "epoch": 0.022259321090706732, "grad_norm": 2.5606592497391945, "learning_rate": 4.333333333333334e-06, "loss": 1.0046, "mean_token_accuracy": 0.724289059638977, "num_tokens": 4703551.0, "step": 40 }, { "epoch": 0.025041736227045076, "grad_norm": 3.539575821864669, "learning_rate": 4.888888888888889e-06, "loss": 1.0161, "mean_token_accuracy": 0.7216586589813232, "num_tokens": 5283285.0, "step": 45 }, { "epoch": 0.027824151363383415, "grad_norm": 2.9309023782989936, "learning_rate": 5.444444444444445e-06, "loss": 1.0203, "mean_token_accuracy": 0.7215822339057922, "num_tokens": 5876874.0, "step": 50 }, { "epoch": 0.03060656649972176, "grad_norm": 3.284413853809602, "learning_rate": 6e-06, "loss": 0.9989, "mean_token_accuracy": 0.7253866314888, "num_tokens": 6470253.0, "step": 55 }, { "epoch": 0.0333889816360601, "grad_norm": 3.035514802811082, "learning_rate": 6.555555555555556e-06, "loss": 0.9791, "mean_token_accuracy": 0.7286934494972229, "num_tokens": 7056051.0, "step": 60 }, { "epoch": 0.036171396772398445, "grad_norm": 2.8291474051822996, "learning_rate": 7.111111111111112e-06, "loss": 0.9859, "mean_token_accuracy": 0.7266968131065369, "num_tokens": 7640554.0, "step": 65 }, { "epoch": 0.038953811908736785, "grad_norm": 2.76432791315742, "learning_rate": 7.666666666666667e-06, "loss": 1.0127, "mean_token_accuracy": 0.7222738623619079, "num_tokens": 8235176.0, "step": 70 }, { "epoch": 0.041736227045075125, "grad_norm": 3.0121797507858985, "learning_rate": 8.222222222222222e-06, "loss": 1.033, "mean_token_accuracy": 0.7161361455917359, "num_tokens": 8823675.0, "step": 75 }, { "epoch": 0.044518642181413465, "grad_norm": 2.6579939949703095, "learning_rate": 8.777777777777778e-06, "loss": 1.0106, "mean_token_accuracy": 0.722004747390747, "num_tokens": 9419065.0, "step": 80 }, { "epoch": 0.04730105731775181, "grad_norm": 2.756043313002429, "learning_rate": 9.333333333333334e-06, "loss": 1.0284, "mean_token_accuracy": 0.7184515237808228, "num_tokens": 10018707.0, "step": 85 }, { "epoch": 0.05008347245409015, "grad_norm": 2.7330649459439487, "learning_rate": 9.88888888888889e-06, "loss": 1.0381, "mean_token_accuracy": 0.716748857498169, "num_tokens": 10615769.0, "step": 90 }, { "epoch": 0.05286588759042849, "grad_norm": 3.119124532731729, "learning_rate": 1.0444444444444445e-05, "loss": 1.0159, "mean_token_accuracy": 0.7196141958236695, "num_tokens": 11206791.0, "step": 95 }, { "epoch": 0.05564830272676683, "grad_norm": 2.6869099855331524, "learning_rate": 1.1000000000000001e-05, "loss": 1.0259, "mean_token_accuracy": 0.7180516362190247, "num_tokens": 11794887.0, "step": 100 }, { "epoch": 0.05843071786310518, "grad_norm": 2.6746654996398225, "learning_rate": 1.1555555555555556e-05, "loss": 1.0068, "mean_token_accuracy": 0.7229955673217774, "num_tokens": 12385737.0, "step": 105 }, { "epoch": 0.06121313299944352, "grad_norm": 2.9016124547228124, "learning_rate": 1.211111111111111e-05, "loss": 1.0407, "mean_token_accuracy": 0.7161330699920654, "num_tokens": 12974642.0, "step": 110 }, { "epoch": 0.06399554813578186, "grad_norm": 2.7590693104720887, "learning_rate": 1.2666666666666667e-05, "loss": 1.034, "mean_token_accuracy": 0.7153326869010925, "num_tokens": 13580517.0, "step": 115 }, { "epoch": 0.0667779632721202, "grad_norm": 3.0191169781226734, "learning_rate": 1.3222222222222223e-05, "loss": 1.0218, "mean_token_accuracy": 0.7199317574501037, "num_tokens": 14169869.0, "step": 120 }, { "epoch": 0.06956037840845854, "grad_norm": 2.673488197212258, "learning_rate": 1.377777777777778e-05, "loss": 1.0589, "mean_token_accuracy": 0.7112971425056458, "num_tokens": 14758276.0, "step": 125 }, { "epoch": 0.07234279354479689, "grad_norm": 2.7094276528268795, "learning_rate": 1.4333333333333334e-05, "loss": 1.049, "mean_token_accuracy": 0.712811267375946, "num_tokens": 15345599.0, "step": 130 }, { "epoch": 0.07512520868113523, "grad_norm": 2.6373785827567957, "learning_rate": 1.488888888888889e-05, "loss": 1.0225, "mean_token_accuracy": 0.7185760021209717, "num_tokens": 15932693.0, "step": 135 }, { "epoch": 0.07790762381747357, "grad_norm": 2.9632223301755154, "learning_rate": 1.5444444444444446e-05, "loss": 1.0654, "mean_token_accuracy": 0.7110099554061889, "num_tokens": 16520098.0, "step": 140 }, { "epoch": 0.08069003895381191, "grad_norm": 2.817380088138206, "learning_rate": 1.6000000000000003e-05, "loss": 1.0674, "mean_token_accuracy": 0.709651243686676, "num_tokens": 17110470.0, "step": 145 }, { "epoch": 0.08347245409015025, "grad_norm": 2.6945752947019073, "learning_rate": 1.6555555555555556e-05, "loss": 1.054, "mean_token_accuracy": 0.7122701048851013, "num_tokens": 17696206.0, "step": 150 }, { "epoch": 0.08625486922648859, "grad_norm": 2.397611832219796, "learning_rate": 1.7111111111111112e-05, "loss": 1.0735, "mean_token_accuracy": 0.709396231174469, "num_tokens": 18292078.0, "step": 155 }, { "epoch": 0.08903728436282693, "grad_norm": 2.7655354856899517, "learning_rate": 1.7666666666666668e-05, "loss": 1.0809, "mean_token_accuracy": 0.7083318114280701, "num_tokens": 18888122.0, "step": 160 }, { "epoch": 0.09181969949916527, "grad_norm": 2.758607000704, "learning_rate": 1.8222222222222224e-05, "loss": 1.0822, "mean_token_accuracy": 0.706460428237915, "num_tokens": 19484865.0, "step": 165 }, { "epoch": 0.09460211463550362, "grad_norm": 2.570904335556245, "learning_rate": 1.877777777777778e-05, "loss": 1.0826, "mean_token_accuracy": 0.7078906774520874, "num_tokens": 20071044.0, "step": 170 }, { "epoch": 0.09738452977184196, "grad_norm": 2.735490789384101, "learning_rate": 1.9333333333333333e-05, "loss": 1.0858, "mean_token_accuracy": 0.7050360441207886, "num_tokens": 20662778.0, "step": 175 }, { "epoch": 0.1001669449081803, "grad_norm": 2.7739191020812655, "learning_rate": 1.988888888888889e-05, "loss": 1.0681, "mean_token_accuracy": 0.7093758583068848, "num_tokens": 21253812.0, "step": 180 }, { "epoch": 0.10294936004451864, "grad_norm": 2.8028592827713363, "learning_rate": 1.9999698027421894e-05, "loss": 1.0702, "mean_token_accuracy": 0.7094247817993165, "num_tokens": 21843322.0, "step": 185 }, { "epoch": 0.10573177518085698, "grad_norm": 2.595713944513709, "learning_rate": 1.9998471295079908e-05, "loss": 1.0458, "mean_token_accuracy": 0.7138230800628662, "num_tokens": 22433061.0, "step": 190 }, { "epoch": 0.10851419031719532, "grad_norm": 2.7873986833719946, "learning_rate": 1.9996301045360874e-05, "loss": 1.0974, "mean_token_accuracy": 0.704916775226593, "num_tokens": 23021153.0, "step": 195 }, { "epoch": 0.11129660545353366, "grad_norm": 2.439806161655261, "learning_rate": 1.9993187483062935e-05, "loss": 1.0771, "mean_token_accuracy": 0.7064628720283508, "num_tokens": 23609275.0, "step": 200 }, { "epoch": 0.11407902058987202, "grad_norm": 2.4957981823187763, "learning_rate": 1.9989130902001025e-05, "loss": 1.0917, "mean_token_accuracy": 0.7053624391555786, "num_tokens": 24205073.0, "step": 205 }, { "epoch": 0.11686143572621036, "grad_norm": 2.896823560689145, "learning_rate": 1.9984131684979134e-05, "loss": 1.1004, "mean_token_accuracy": 0.7049420475959778, "num_tokens": 24800240.0, "step": 210 }, { "epoch": 0.1196438508625487, "grad_norm": 2.5192627022618623, "learning_rate": 1.997819030375419e-05, "loss": 1.0623, "mean_token_accuracy": 0.7119413614273071, "num_tokens": 25387549.0, "step": 215 }, { "epoch": 0.12242626599888703, "grad_norm": 2.8132540932895695, "learning_rate": 1.9971307318991546e-05, "loss": 1.0915, "mean_token_accuracy": 0.7074636220932007, "num_tokens": 25965133.0, "step": 220 }, { "epoch": 0.12520868113522537, "grad_norm": 35.333375177867765, "learning_rate": 1.996348338021207e-05, "loss": 1.114, "mean_token_accuracy": 0.700543737411499, "num_tokens": 26563977.0, "step": 225 }, { "epoch": 0.12799109627156371, "grad_norm": 2.920196605955216, "learning_rate": 1.9954719225730847e-05, "loss": 1.1139, "mean_token_accuracy": 0.7011779904365539, "num_tokens": 27156932.0, "step": 230 }, { "epoch": 0.13077351140790205, "grad_norm": 2.60574074107156, "learning_rate": 1.9945015682587512e-05, "loss": 1.096, "mean_token_accuracy": 0.7043320059776306, "num_tokens": 27754019.0, "step": 235 }, { "epoch": 0.1335559265442404, "grad_norm": 2.865631892072952, "learning_rate": 1.9934373666468203e-05, "loss": 1.0804, "mean_token_accuracy": 0.706881308555603, "num_tokens": 28342275.0, "step": 240 }, { "epoch": 0.13633834168057873, "grad_norm": 2.504324070572632, "learning_rate": 1.992279418161915e-05, "loss": 1.099, "mean_token_accuracy": 0.7036979913711547, "num_tokens": 28928534.0, "step": 245 }, { "epoch": 0.13912075681691707, "grad_norm": 2.6138293066655045, "learning_rate": 1.991027832075192e-05, "loss": 1.0921, "mean_token_accuracy": 0.7047542929649353, "num_tokens": 29513990.0, "step": 250 }, { "epoch": 0.1419031719532554, "grad_norm": 2.3711537074020037, "learning_rate": 1.989682726494028e-05, "loss": 1.0562, "mean_token_accuracy": 0.7139820337295533, "num_tokens": 30113881.0, "step": 255 }, { "epoch": 0.14468558708959378, "grad_norm": 2.338233428322228, "learning_rate": 1.988244228350877e-05, "loss": 1.0811, "mean_token_accuracy": 0.7071714401245117, "num_tokens": 30700467.0, "step": 260 }, { "epoch": 0.14746800222593212, "grad_norm": 2.5175646612189655, "learning_rate": 1.986712473391289e-05, "loss": 1.0979, "mean_token_accuracy": 0.7044062852859497, "num_tokens": 31292719.0, "step": 265 }, { "epoch": 0.15025041736227046, "grad_norm": 2.40596554930329, "learning_rate": 1.9850876061611036e-05, "loss": 1.092, "mean_token_accuracy": 0.706499433517456, "num_tokens": 31883435.0, "step": 270 }, { "epoch": 0.1530328324986088, "grad_norm": 2.4796442221955384, "learning_rate": 1.9833697799928074e-05, "loss": 1.0967, "mean_token_accuracy": 0.7027202010154724, "num_tokens": 32481693.0, "step": 275 }, { "epoch": 0.15581524763494714, "grad_norm": 2.4780125377517663, "learning_rate": 1.9815591569910654e-05, "loss": 1.1121, "mean_token_accuracy": 0.7006395697593689, "num_tokens": 33071205.0, "step": 280 }, { "epoch": 0.15859766277128548, "grad_norm": 2.3510385258823976, "learning_rate": 1.979655908017424e-05, "loss": 1.0861, "mean_token_accuracy": 0.7057282090187073, "num_tokens": 33671052.0, "step": 285 }, { "epoch": 0.16138007790762382, "grad_norm": 2.5067301813131655, "learning_rate": 1.9776602126741867e-05, "loss": 1.0807, "mean_token_accuracy": 0.7070404767990113, "num_tokens": 34260518.0, "step": 290 }, { "epoch": 0.16416249304396216, "grad_norm": 2.28351268206912, "learning_rate": 1.975572259287467e-05, "loss": 1.0803, "mean_token_accuracy": 0.7072898864746093, "num_tokens": 34848727.0, "step": 295 }, { "epoch": 0.1669449081803005, "grad_norm": 2.7544633183153926, "learning_rate": 1.973392244889415e-05, "loss": 1.0854, "mean_token_accuracy": 0.7059407949447631, "num_tokens": 35437641.0, "step": 300 }, { "epoch": 0.16972732331663884, "grad_norm": 2.3830000818389925, "learning_rate": 1.9711203751996267e-05, "loss": 1.0988, "mean_token_accuracy": 0.7048187136650086, "num_tokens": 36037300.0, "step": 305 }, { "epoch": 0.17250973845297718, "grad_norm": 2.360850123530632, "learning_rate": 1.9687568646057277e-05, "loss": 1.0736, "mean_token_accuracy": 0.7092967867851258, "num_tokens": 36629457.0, "step": 310 }, { "epoch": 0.17529215358931552, "grad_norm": 2.2707504412701582, "learning_rate": 1.966301936143146e-05, "loss": 1.0958, "mean_token_accuracy": 0.7042588949203491, "num_tokens": 37224522.0, "step": 315 }, { "epoch": 0.17807456872565386, "grad_norm": 2.201925130311423, "learning_rate": 1.9637558214740618e-05, "loss": 1.0964, "mean_token_accuracy": 0.7053308248519897, "num_tokens": 37815851.0, "step": 320 }, { "epoch": 0.1808569838619922, "grad_norm": 2.2178593995634697, "learning_rate": 1.9611187608655484e-05, "loss": 1.1105, "mean_token_accuracy": 0.7033011674880981, "num_tokens": 38396495.0, "step": 325 }, { "epoch": 0.18363939899833054, "grad_norm": 2.2136118629553185, "learning_rate": 1.9583910031668984e-05, "loss": 1.0862, "mean_token_accuracy": 0.7051831126213074, "num_tokens": 38994173.0, "step": 330 }, { "epoch": 0.1864218141346689, "grad_norm": 2.292573884422775, "learning_rate": 1.955572805786141e-05, "loss": 1.1233, "mean_token_accuracy": 0.7006201148033142, "num_tokens": 39577921.0, "step": 335 }, { "epoch": 0.18920422927100725, "grad_norm": 2.2980324001779873, "learning_rate": 1.9526644346657508e-05, "loss": 1.1007, "mean_token_accuracy": 0.7046499371528625, "num_tokens": 40170313.0, "step": 340 }, { "epoch": 0.19198664440734559, "grad_norm": 2.2611492444425916, "learning_rate": 1.9496661642575517e-05, "loss": 1.065, "mean_token_accuracy": 0.7105947017669678, "num_tokens": 40765429.0, "step": 345 }, { "epoch": 0.19476905954368393, "grad_norm": 2.1512493790533185, "learning_rate": 1.946578277496821e-05, "loss": 1.0917, "mean_token_accuracy": 0.7073456883430481, "num_tokens": 41365863.0, "step": 350 }, { "epoch": 0.19755147468002227, "grad_norm": 2.271847848176885, "learning_rate": 1.943401065775584e-05, "loss": 1.1011, "mean_token_accuracy": 0.7054303050041199, "num_tokens": 41957368.0, "step": 355 }, { "epoch": 0.2003338898163606, "grad_norm": 2.222128508786771, "learning_rate": 1.940134828915123e-05, "loss": 1.1086, "mean_token_accuracy": 0.7034181118011474, "num_tokens": 42546312.0, "step": 360 }, { "epoch": 0.20311630495269895, "grad_norm": 2.159135050960954, "learning_rate": 1.936779875137678e-05, "loss": 1.0821, "mean_token_accuracy": 0.707557737827301, "num_tokens": 43141094.0, "step": 365 }, { "epoch": 0.20589872008903728, "grad_norm": 2.2055966042491546, "learning_rate": 1.9333365210373668e-05, "loss": 1.0902, "mean_token_accuracy": 0.705539345741272, "num_tokens": 43731225.0, "step": 370 }, { "epoch": 0.20868113522537562, "grad_norm": 2.1858790427306785, "learning_rate": 1.9298050915503053e-05, "loss": 1.1066, "mean_token_accuracy": 0.7038124799728394, "num_tokens": 44311286.0, "step": 375 }, { "epoch": 0.21146355036171396, "grad_norm": 2.204033703125247, "learning_rate": 1.926185919923946e-05, "loss": 1.0971, "mean_token_accuracy": 0.7054288148880005, "num_tokens": 44906755.0, "step": 380 }, { "epoch": 0.2142459654980523, "grad_norm": 2.2598831388779694, "learning_rate": 1.9224793476856293e-05, "loss": 1.1201, "mean_token_accuracy": 0.700083589553833, "num_tokens": 45491095.0, "step": 385 }, { "epoch": 0.21702838063439064, "grad_norm": 2.1902124071852977, "learning_rate": 1.9186857246103586e-05, "loss": 1.079, "mean_token_accuracy": 0.7079141974449158, "num_tokens": 46084794.0, "step": 390 }, { "epoch": 0.21981079577072898, "grad_norm": 2.225166806223554, "learning_rate": 1.9148054086877884e-05, "loss": 1.0965, "mean_token_accuracy": 0.7044667720794677, "num_tokens": 46674587.0, "step": 395 }, { "epoch": 0.22259321090706732, "grad_norm": 2.110266160830183, "learning_rate": 1.9108387660884456e-05, "loss": 1.1019, "mean_token_accuracy": 0.7042423367500306, "num_tokens": 47263613.0, "step": 400 }, { "epoch": 0.22537562604340566, "grad_norm": 2.137008460009227, "learning_rate": 1.9067861711291744e-05, "loss": 1.0984, "mean_token_accuracy": 0.7045777201652527, "num_tokens": 47848405.0, "step": 405 }, { "epoch": 0.22815804117974403, "grad_norm": 2.5090054849409573, "learning_rate": 1.9026480062378136e-05, "loss": 1.1232, "mean_token_accuracy": 0.7006029844284057, "num_tokens": 48440420.0, "step": 410 }, { "epoch": 0.23094045631608237, "grad_norm": 2.5298900029195934, "learning_rate": 1.8984246619171075e-05, "loss": 1.0998, "mean_token_accuracy": 0.7040575265884399, "num_tokens": 49026577.0, "step": 415 }, { "epoch": 0.2337228714524207, "grad_norm": 2.243788572378369, "learning_rate": 1.894116536707857e-05, "loss": 1.0931, "mean_token_accuracy": 0.7059786558151245, "num_tokens": 49618303.0, "step": 420 }, { "epoch": 0.23650528658875905, "grad_norm": 2.2973971910882853, "learning_rate": 1.8897240371513098e-05, "loss": 1.1076, "mean_token_accuracy": 0.7032187581062317, "num_tokens": 50211716.0, "step": 425 }, { "epoch": 0.2392877017250974, "grad_norm": 2.121159415163043, "learning_rate": 1.8852475777507983e-05, "loss": 1.0882, "mean_token_accuracy": 0.7079625129699707, "num_tokens": 50806268.0, "step": 430 }, { "epoch": 0.24207011686143573, "grad_norm": 2.2653088399729593, "learning_rate": 1.8806875809326204e-05, "loss": 1.0988, "mean_token_accuracy": 0.7044902324676514, "num_tokens": 51395551.0, "step": 435 }, { "epoch": 0.24485253199777407, "grad_norm": 2.0640694310319647, "learning_rate": 1.876044477006183e-05, "loss": 1.1057, "mean_token_accuracy": 0.7019346117973327, "num_tokens": 51988430.0, "step": 440 }, { "epoch": 0.2476349471341124, "grad_norm": 2.1331480343408225, "learning_rate": 1.8713187041233896e-05, "loss": 1.0845, "mean_token_accuracy": 0.7060743689537048, "num_tokens": 52583147.0, "step": 445 }, { "epoch": 0.25041736227045075, "grad_norm": 2.4013534842444186, "learning_rate": 1.866510708237297e-05, "loss": 1.0979, "mean_token_accuracy": 0.7047066450119018, "num_tokens": 53181352.0, "step": 450 }, { "epoch": 0.2531997774067891, "grad_norm": 2.3023711179533226, "learning_rate": 1.861620943060031e-05, "loss": 1.1275, "mean_token_accuracy": 0.6983560442924499, "num_tokens": 53772836.0, "step": 455 }, { "epoch": 0.25598219254312743, "grad_norm": 2.2577981667782208, "learning_rate": 1.856649870019972e-05, "loss": 1.0957, "mean_token_accuracy": 0.7056548476219178, "num_tokens": 54367700.0, "step": 460 }, { "epoch": 0.2587646076794658, "grad_norm": 2.676938686558113, "learning_rate": 1.8515979582182112e-05, "loss": 1.0906, "mean_token_accuracy": 0.707176685333252, "num_tokens": 54960810.0, "step": 465 }, { "epoch": 0.2615470228158041, "grad_norm": 2.4165806936926133, "learning_rate": 1.8464656843842837e-05, "loss": 1.0897, "mean_token_accuracy": 0.7070010900497437, "num_tokens": 55550003.0, "step": 470 }, { "epoch": 0.2643294379521425, "grad_norm": 2.461649569325264, "learning_rate": 1.8412535328311813e-05, "loss": 1.1121, "mean_token_accuracy": 0.7028052568435669, "num_tokens": 56136218.0, "step": 475 }, { "epoch": 0.2671118530884808, "grad_norm": 2.1804919862847365, "learning_rate": 1.8359619954096497e-05, "loss": 1.1076, "mean_token_accuracy": 0.7032665610313416, "num_tokens": 56726599.0, "step": 480 }, { "epoch": 0.26989426822481916, "grad_norm": 2.68554258980135, "learning_rate": 1.8305915714617745e-05, "loss": 1.0993, "mean_token_accuracy": 0.7033315062522888, "num_tokens": 57321297.0, "step": 485 }, { "epoch": 0.27267668336115747, "grad_norm": 2.3164265508749544, "learning_rate": 1.8251427677738596e-05, "loss": 1.067, "mean_token_accuracy": 0.710555636882782, "num_tokens": 57913003.0, "step": 490 }, { "epoch": 0.27545909849749584, "grad_norm": 2.25329369986598, "learning_rate": 1.8196160985286052e-05, "loss": 1.0913, "mean_token_accuracy": 0.708107590675354, "num_tokens": 58499228.0, "step": 495 }, { "epoch": 0.27824151363383415, "grad_norm": 2.2429809416951656, "learning_rate": 1.814012085256585e-05, "loss": 1.0993, "mean_token_accuracy": 0.7040925621986389, "num_tokens": 59090708.0, "step": 500 }, { "epoch": 0.2810239287701725, "grad_norm": 2.2073249069907384, "learning_rate": 1.8083312567870315e-05, "loss": 1.0879, "mean_token_accuracy": 0.7081225514411926, "num_tokens": 59685930.0, "step": 505 }, { "epoch": 0.2838063439065108, "grad_norm": 2.0857359362520214, "learning_rate": 1.8025741491979326e-05, "loss": 1.0616, "mean_token_accuracy": 0.7111712694168091, "num_tokens": 60280434.0, "step": 510 }, { "epoch": 0.2865887590428492, "grad_norm": 2.163016097659582, "learning_rate": 1.7967413057654452e-05, "loss": 1.0775, "mean_token_accuracy": 0.7096009373664856, "num_tokens": 60868682.0, "step": 515 }, { "epoch": 0.28937117417918756, "grad_norm": 2.3117437321963306, "learning_rate": 1.7908332769126255e-05, "loss": 1.1076, "mean_token_accuracy": 0.7027746677398682, "num_tokens": 61458691.0, "step": 520 }, { "epoch": 0.2921535893155259, "grad_norm": 2.4449046203168856, "learning_rate": 1.784850620157491e-05, "loss": 1.0963, "mean_token_accuracy": 0.7077104687690735, "num_tokens": 62050298.0, "step": 525 }, { "epoch": 0.29493600445186424, "grad_norm": 2.2894244206650574, "learning_rate": 1.7787939000604063e-05, "loss": 1.074, "mean_token_accuracy": 0.709146237373352, "num_tokens": 62641275.0, "step": 530 }, { "epoch": 0.29771841958820255, "grad_norm": 2.235175132800985, "learning_rate": 1.7726636881708114e-05, "loss": 1.0921, "mean_token_accuracy": 0.7072658061981201, "num_tokens": 63230436.0, "step": 535 }, { "epoch": 0.3005008347245409, "grad_norm": 2.1669106168397665, "learning_rate": 1.7664605629732832e-05, "loss": 1.0954, "mean_token_accuracy": 0.7046370029449462, "num_tokens": 63818119.0, "step": 540 }, { "epoch": 0.30328324986087923, "grad_norm": 2.1441378931657478, "learning_rate": 1.7601851098329484e-05, "loss": 1.0671, "mean_token_accuracy": 0.710686981678009, "num_tokens": 64410016.0, "step": 545 }, { "epoch": 0.3060656649972176, "grad_norm": 2.124149203635388, "learning_rate": 1.7538379209402442e-05, "loss": 1.0893, "mean_token_accuracy": 0.7079866886138916, "num_tokens": 65008878.0, "step": 550 }, { "epoch": 0.3088480801335559, "grad_norm": 2.092451847928283, "learning_rate": 1.7474195952550355e-05, "loss": 1.0911, "mean_token_accuracy": 0.7058361053466797, "num_tokens": 65591920.0, "step": 555 }, { "epoch": 0.3116304952698943, "grad_norm": 1.979401747526117, "learning_rate": 1.7409307384500932e-05, "loss": 1.0781, "mean_token_accuracy": 0.7093043208122254, "num_tokens": 66183326.0, "step": 560 }, { "epoch": 0.3144129104062326, "grad_norm": 2.3013535796680133, "learning_rate": 1.7343719628539396e-05, "loss": 1.1062, "mean_token_accuracy": 0.7034829258918762, "num_tokens": 66770419.0, "step": 565 }, { "epoch": 0.31719532554257096, "grad_norm": 2.232144210677623, "learning_rate": 1.7277438873930654e-05, "loss": 1.0888, "mean_token_accuracy": 0.7088476419448853, "num_tokens": 67356232.0, "step": 570 }, { "epoch": 0.3199777406789093, "grad_norm": 2.2615310284916426, "learning_rate": 1.7210471375335225e-05, "loss": 1.0762, "mean_token_accuracy": 0.7096709370613098, "num_tokens": 67948261.0, "step": 575 }, { "epoch": 0.32276015581524764, "grad_norm": 2.069623037881395, "learning_rate": 1.7142823452219036e-05, "loss": 1.0584, "mean_token_accuracy": 0.7133225679397583, "num_tokens": 68530816.0, "step": 580 }, { "epoch": 0.32554257095158595, "grad_norm": 2.01949110828742, "learning_rate": 1.7074501488257062e-05, "loss": 1.0771, "mean_token_accuracy": 0.7082255363464356, "num_tokens": 69121402.0, "step": 585 }, { "epoch": 0.3283249860879243, "grad_norm": 2.114712605110562, "learning_rate": 1.700551193073092e-05, "loss": 1.0434, "mean_token_accuracy": 0.7137895464897156, "num_tokens": 69707900.0, "step": 590 }, { "epoch": 0.3311074012242627, "grad_norm": 2.1204083275143533, "learning_rate": 1.693586128992048e-05, "loss": 1.0753, "mean_token_accuracy": 0.7090141296386718, "num_tokens": 70297299.0, "step": 595 }, { "epoch": 0.333889816360601, "grad_norm": 2.170713633905745, "learning_rate": 1.6865556138489497e-05, "loss": 1.0944, "mean_token_accuracy": 0.706296420097351, "num_tokens": 70886257.0, "step": 600 }, { "epoch": 0.33667223149693937, "grad_norm": 2.1428343367458074, "learning_rate": 1.6794603110865396e-05, "loss": 1.0871, "mean_token_accuracy": 0.7076637268066406, "num_tokens": 71474356.0, "step": 605 }, { "epoch": 0.3394546466332777, "grad_norm": 3.3075272432648886, "learning_rate": 1.672300890261317e-05, "loss": 1.044, "mean_token_accuracy": 0.7172706961631775, "num_tokens": 72059816.0, "step": 610 }, { "epoch": 0.34223706176961605, "grad_norm": 2.1270286082703573, "learning_rate": 1.6650780269803587e-05, "loss": 1.0863, "mean_token_accuracy": 0.7074844360351562, "num_tokens": 72652774.0, "step": 615 }, { "epoch": 0.34501947690595436, "grad_norm": 2.100968885315603, "learning_rate": 1.6577924028375622e-05, "loss": 1.0677, "mean_token_accuracy": 0.71006840467453, "num_tokens": 73239819.0, "step": 620 }, { "epoch": 0.3478018920422927, "grad_norm": 2.008632532868866, "learning_rate": 1.6504447053493264e-05, "loss": 1.0645, "mean_token_accuracy": 0.7101643443107605, "num_tokens": 73831159.0, "step": 625 }, { "epoch": 0.35058430717863104, "grad_norm": 2.0858240283477545, "learning_rate": 1.643035627889674e-05, "loss": 1.0717, "mean_token_accuracy": 0.7094730496406555, "num_tokens": 74422688.0, "step": 630 }, { "epoch": 0.3533667223149694, "grad_norm": 2.1384864549062197, "learning_rate": 1.63556586962482e-05, "loss": 1.1, "mean_token_accuracy": 0.7050098419189453, "num_tokens": 75009215.0, "step": 635 }, { "epoch": 0.3561491374513077, "grad_norm": 2.106464767907768, "learning_rate": 1.628036135447194e-05, "loss": 1.0894, "mean_token_accuracy": 0.707071328163147, "num_tokens": 75598228.0, "step": 640 }, { "epoch": 0.3589315525876461, "grad_norm": 2.0551179685857144, "learning_rate": 1.6204471359089224e-05, "loss": 1.0785, "mean_token_accuracy": 0.7078182816505432, "num_tokens": 76186740.0, "step": 645 }, { "epoch": 0.3617139677239844, "grad_norm": 2.1646737527032314, "learning_rate": 1.612799587154777e-05, "loss": 1.07, "mean_token_accuracy": 0.7111572623252869, "num_tokens": 76774832.0, "step": 650 }, { "epoch": 0.36449638286032277, "grad_norm": 2.104548936492404, "learning_rate": 1.6050942108545938e-05, "loss": 1.0747, "mean_token_accuracy": 0.7105032086372376, "num_tokens": 77363315.0, "step": 655 }, { "epoch": 0.3672787979966611, "grad_norm": 2.0651187265677216, "learning_rate": 1.5973317341351725e-05, "loss": 1.0697, "mean_token_accuracy": 0.7097868919372559, "num_tokens": 77951799.0, "step": 660 }, { "epoch": 0.37006121313299944, "grad_norm": 1.9772941125582544, "learning_rate": 1.58951288951166e-05, "loss": 1.0703, "mean_token_accuracy": 0.7106229305267334, "num_tokens": 78551404.0, "step": 665 }, { "epoch": 0.3728436282693378, "grad_norm": 2.081741954302233, "learning_rate": 1.5816384148184273e-05, "loss": 1.0564, "mean_token_accuracy": 0.7130509853363037, "num_tokens": 79148333.0, "step": 670 }, { "epoch": 0.3756260434056761, "grad_norm": 2.116111905318221, "learning_rate": 1.57370905313944e-05, "loss": 1.0901, "mean_token_accuracy": 0.7071909785270691, "num_tokens": 79731743.0, "step": 675 }, { "epoch": 0.3784084585420145, "grad_norm": 2.010006167198628, "learning_rate": 1.5657255527381395e-05, "loss": 1.0741, "mean_token_accuracy": 0.7091400980949402, "num_tokens": 80332028.0, "step": 680 }, { "epoch": 0.3811908736783528, "grad_norm": 2.0107259580616956, "learning_rate": 1.5576886669868297e-05, "loss": 1.0492, "mean_token_accuracy": 0.7131890416145324, "num_tokens": 80923863.0, "step": 685 }, { "epoch": 0.38397328881469117, "grad_norm": 1.9791050603322653, "learning_rate": 1.5495991542955855e-05, "loss": 1.0503, "mean_token_accuracy": 0.7160694479942322, "num_tokens": 81512560.0, "step": 690 }, { "epoch": 0.3867557039510295, "grad_norm": 1.9490662087580195, "learning_rate": 1.541457778040684e-05, "loss": 1.0529, "mean_token_accuracy": 0.7135980725288391, "num_tokens": 82097379.0, "step": 695 }, { "epoch": 0.38953811908736785, "grad_norm": 2.0179615858909377, "learning_rate": 1.5332653064925683e-05, "loss": 1.0519, "mean_token_accuracy": 0.7147277235984802, "num_tokens": 82685268.0, "step": 700 }, { "epoch": 0.39232053422370616, "grad_norm": 2.0450453857514175, "learning_rate": 1.5250225127433485e-05, "loss": 1.043, "mean_token_accuracy": 0.7144908547401428, "num_tokens": 83277230.0, "step": 705 }, { "epoch": 0.39510294936004453, "grad_norm": 1.9276469987768912, "learning_rate": 1.5167301746338466e-05, "loss": 1.0784, "mean_token_accuracy": 0.7108999609947204, "num_tokens": 83861406.0, "step": 710 }, { "epoch": 0.39788536449638284, "grad_norm": 1.9433023381927899, "learning_rate": 1.5083890746801962e-05, "loss": 1.0597, "mean_token_accuracy": 0.7121692419052124, "num_tokens": 84459146.0, "step": 715 }, { "epoch": 0.4006677796327212, "grad_norm": 1.9546874431863348, "learning_rate": 1.5000000000000002e-05, "loss": 1.0919, "mean_token_accuracy": 0.7079582571983337, "num_tokens": 85049969.0, "step": 720 }, { "epoch": 0.4034501947690595, "grad_norm": 1.9563600752313475, "learning_rate": 1.491563742238051e-05, "loss": 1.0692, "mean_token_accuracy": 0.7110470652580261, "num_tokens": 85638302.0, "step": 725 }, { "epoch": 0.4062326099053979, "grad_norm": 1.9290273436145757, "learning_rate": 1.483081097491628e-05, "loss": 1.0697, "mean_token_accuracy": 0.7116230726242065, "num_tokens": 86229643.0, "step": 730 }, { "epoch": 0.4090150250417362, "grad_norm": 1.9858611430728594, "learning_rate": 1.4745528662353728e-05, "loss": 1.0483, "mean_token_accuracy": 0.7151864290237426, "num_tokens": 86822104.0, "step": 735 }, { "epoch": 0.41179744017807457, "grad_norm": 2.04642483648034, "learning_rate": 1.4659798532457497e-05, "loss": 1.0775, "mean_token_accuracy": 0.7090552926063538, "num_tokens": 87413792.0, "step": 740 }, { "epoch": 0.41457985531441294, "grad_norm": 1.9893794431614882, "learning_rate": 1.4573628675251051e-05, "loss": 1.05, "mean_token_accuracy": 0.7146545886993408, "num_tokens": 88001772.0, "step": 745 }, { "epoch": 0.41736227045075125, "grad_norm": 1.9071569454284205, "learning_rate": 1.4487027222253216e-05, "loss": 1.071, "mean_token_accuracy": 0.7112080335617066, "num_tokens": 88586368.0, "step": 750 }, { "epoch": 0.4201446855870896, "grad_norm": 2.1261499946440106, "learning_rate": 1.4400002345710871e-05, "loss": 1.053, "mean_token_accuracy": 0.7138799786567688, "num_tokens": 89169649.0, "step": 755 }, { "epoch": 0.42292710072342793, "grad_norm": 2.0601232013853354, "learning_rate": 1.4312562257827742e-05, "loss": 1.0592, "mean_token_accuracy": 0.7137506484985352, "num_tokens": 89758883.0, "step": 760 }, { "epoch": 0.4257095158597663, "grad_norm": 2.0053263574303126, "learning_rate": 1.4224715209989463e-05, "loss": 1.0762, "mean_token_accuracy": 0.7106667995452881, "num_tokens": 90343260.0, "step": 765 }, { "epoch": 0.4284919309961046, "grad_norm": 2.040531463590581, "learning_rate": 1.4136469491984913e-05, "loss": 1.0532, "mean_token_accuracy": 0.7144197583198547, "num_tokens": 90931881.0, "step": 770 }, { "epoch": 0.431274346132443, "grad_norm": 2.1902642918655353, "learning_rate": 1.4047833431223938e-05, "loss": 1.0688, "mean_token_accuracy": 0.7094583511352539, "num_tokens": 91515784.0, "step": 775 }, { "epoch": 0.4340567612687813, "grad_norm": 2.128849928536235, "learning_rate": 1.3958815391951552e-05, "loss": 1.0675, "mean_token_accuracy": 0.7103721380233765, "num_tokens": 92113098.0, "step": 780 }, { "epoch": 0.43683917640511966, "grad_norm": 1.9500214149444213, "learning_rate": 1.3869423774458594e-05, "loss": 1.0728, "mean_token_accuracy": 0.7097015857696534, "num_tokens": 92709566.0, "step": 785 }, { "epoch": 0.43962159154145797, "grad_norm": 1.8764892174897658, "learning_rate": 1.3779667014289067e-05, "loss": 1.0431, "mean_token_accuracy": 0.7169391632080078, "num_tokens": 93292537.0, "step": 790 }, { "epoch": 0.44240400667779634, "grad_norm": 2.10238424716131, "learning_rate": 1.3689553581444069e-05, "loss": 1.0145, "mean_token_accuracy": 0.7227911353111267, "num_tokens": 93878784.0, "step": 795 }, { "epoch": 0.44518642181413465, "grad_norm": 2.0641117161806033, "learning_rate": 1.3599091979582537e-05, "loss": 1.0576, "mean_token_accuracy": 0.7129832863807678, "num_tokens": 94467072.0, "step": 800 }, { "epoch": 0.447968836950473, "grad_norm": 2.0209262573363143, "learning_rate": 1.3508290745218789e-05, "loss": 1.0281, "mean_token_accuracy": 0.7192481160163879, "num_tokens": 95055139.0, "step": 805 }, { "epoch": 0.4507512520868113, "grad_norm": 1.9960033545510802, "learning_rate": 1.341715844691695e-05, "loss": 1.0381, "mean_token_accuracy": 0.7170910716056824, "num_tokens": 95643923.0, "step": 810 }, { "epoch": 0.4535336672231497, "grad_norm": 2.666908392602128, "learning_rate": 1.3325703684482383e-05, "loss": 1.0911, "mean_token_accuracy": 0.7066366791725158, "num_tokens": 96229319.0, "step": 815 }, { "epoch": 0.45631608235948806, "grad_norm": 1.9412627867610472, "learning_rate": 1.3233935088150154e-05, "loss": 1.044, "mean_token_accuracy": 0.7168261289596558, "num_tokens": 96825493.0, "step": 820 }, { "epoch": 0.4590984974958264, "grad_norm": 1.8919553931638313, "learning_rate": 1.3141861317770628e-05, "loss": 1.0856, "mean_token_accuracy": 0.708315372467041, "num_tokens": 97415636.0, "step": 825 }, { "epoch": 0.46188091263216474, "grad_norm": 1.979892135995854, "learning_rate": 1.3049491061992274e-05, "loss": 1.0411, "mean_token_accuracy": 0.716647469997406, "num_tokens": 98008396.0, "step": 830 }, { "epoch": 0.46466332776850305, "grad_norm": 1.941207079118909, "learning_rate": 1.2956833037441756e-05, "loss": 1.0489, "mean_token_accuracy": 0.7146740078926086, "num_tokens": 98593026.0, "step": 835 }, { "epoch": 0.4674457429048414, "grad_norm": 2.184633389681683, "learning_rate": 1.2863895987901364e-05, "loss": 1.0746, "mean_token_accuracy": 0.7111501693725586, "num_tokens": 99185818.0, "step": 840 }, { "epoch": 0.47022815804117973, "grad_norm": 1.9997670534792031, "learning_rate": 1.2770688683483914e-05, "loss": 1.0701, "mean_token_accuracy": 0.708341383934021, "num_tokens": 99774152.0, "step": 845 }, { "epoch": 0.4730105731775181, "grad_norm": 2.0094775586736953, "learning_rate": 1.2677219919805137e-05, "loss": 1.0455, "mean_token_accuracy": 0.7151992082595825, "num_tokens": 100363649.0, "step": 850 }, { "epoch": 0.4757929883138564, "grad_norm": 2.159247041588428, "learning_rate": 1.2583498517153662e-05, "loss": 1.0338, "mean_token_accuracy": 0.7189494609832764, "num_tokens": 100957067.0, "step": 855 }, { "epoch": 0.4785754034501948, "grad_norm": 2.028288393918697, "learning_rate": 1.2489533319658703e-05, "loss": 1.0394, "mean_token_accuracy": 0.7162809491157531, "num_tokens": 101549170.0, "step": 860 }, { "epoch": 0.4813578185865331, "grad_norm": 2.03306725822367, "learning_rate": 1.2395333194455444e-05, "loss": 1.0468, "mean_token_accuracy": 0.7151136279106141, "num_tokens": 102142380.0, "step": 865 }, { "epoch": 0.48414023372287146, "grad_norm": 2.1682696530364374, "learning_rate": 1.2300907030848307e-05, "loss": 1.0554, "mean_token_accuracy": 0.7153695344924926, "num_tokens": 102734295.0, "step": 870 }, { "epoch": 0.4869226488592098, "grad_norm": 2.019722631206529, "learning_rate": 1.2206263739472085e-05, "loss": 1.0397, "mean_token_accuracy": 0.7160439848899841, "num_tokens": 103319783.0, "step": 875 }, { "epoch": 0.48970506399554814, "grad_norm": 1.9817132841610883, "learning_rate": 1.2111412251451085e-05, "loss": 1.0487, "mean_token_accuracy": 0.7163015246391297, "num_tokens": 103911953.0, "step": 880 }, { "epoch": 0.49248747913188645, "grad_norm": 1.924001725795815, "learning_rate": 1.2016361517556334e-05, "loss": 1.0267, "mean_token_accuracy": 0.7179745554924011, "num_tokens": 104499490.0, "step": 885 }, { "epoch": 0.4952698942682248, "grad_norm": 1.9061990735413328, "learning_rate": 1.1921120507360934e-05, "loss": 1.0194, "mean_token_accuracy": 0.721126937866211, "num_tokens": 105087086.0, "step": 890 }, { "epoch": 0.4980523094045632, "grad_norm": 2.0609228249311635, "learning_rate": 1.182569820839362e-05, "loss": 1.0241, "mean_token_accuracy": 0.7190962195396423, "num_tokens": 105676890.0, "step": 895 }, { "epoch": 0.5008347245409015, "grad_norm": 2.136985315971352, "learning_rate": 1.1730103625290658e-05, "loss": 1.0727, "mean_token_accuracy": 0.7091086864471435, "num_tokens": 106260405.0, "step": 900 }, { "epoch": 0.5036171396772399, "grad_norm": 1.8617004459073545, "learning_rate": 1.1634345778946112e-05, "loss": 1.032, "mean_token_accuracy": 0.7186322927474975, "num_tokens": 106854042.0, "step": 905 }, { "epoch": 0.5063995548135782, "grad_norm": 1.9034875739816928, "learning_rate": 1.1538433705660561e-05, "loss": 1.0323, "mean_token_accuracy": 0.7186863660812378, "num_tokens": 107444483.0, "step": 910 }, { "epoch": 0.5091819699499165, "grad_norm": 2.0209462331889156, "learning_rate": 1.1442376456288402e-05, "loss": 1.0378, "mean_token_accuracy": 0.7178295731544495, "num_tokens": 108034521.0, "step": 915 }, { "epoch": 0.5119643850862549, "grad_norm": 1.9075532608687007, "learning_rate": 1.1346183095383731e-05, "loss": 1.0475, "mean_token_accuracy": 0.7155048370361328, "num_tokens": 108621638.0, "step": 920 }, { "epoch": 0.5147468002225932, "grad_norm": 2.1926421106061165, "learning_rate": 1.1249862700344969e-05, "loss": 1.0305, "mean_token_accuracy": 0.7172364115715026, "num_tokens": 109218688.0, "step": 925 }, { "epoch": 0.5175292153589316, "grad_norm": 1.9946226985852828, "learning_rate": 1.1153424360558268e-05, "loss": 1.0339, "mean_token_accuracy": 0.716673743724823, "num_tokens": 109808908.0, "step": 930 }, { "epoch": 0.5203116304952699, "grad_norm": 1.9329841963207612, "learning_rate": 1.1056877176539767e-05, "loss": 1.0291, "mean_token_accuracy": 0.7183609366416931, "num_tokens": 110396483.0, "step": 935 }, { "epoch": 0.5230940456316082, "grad_norm": 2.013170356946493, "learning_rate": 1.0960230259076819e-05, "loss": 1.0456, "mean_token_accuracy": 0.7170237064361572, "num_tokens": 110985830.0, "step": 940 }, { "epoch": 0.5258764607679466, "grad_norm": 1.8999726413056113, "learning_rate": 1.086349272836824e-05, "loss": 1.0313, "mean_token_accuracy": 0.7180803418159485, "num_tokens": 111581446.0, "step": 945 }, { "epoch": 0.528658875904285, "grad_norm": 2.0409502249425335, "learning_rate": 1.0766673713163667e-05, "loss": 1.0261, "mean_token_accuracy": 0.717565405368805, "num_tokens": 112170288.0, "step": 950 }, { "epoch": 0.5314412910406232, "grad_norm": 1.885899964182058, "learning_rate": 1.0669782349902122e-05, "loss": 1.0363, "mean_token_accuracy": 0.7165609478950501, "num_tokens": 112758548.0, "step": 955 }, { "epoch": 0.5342237061769616, "grad_norm": 1.8421326453167242, "learning_rate": 1.0572827781849835e-05, "loss": 1.0248, "mean_token_accuracy": 0.7190156698226928, "num_tokens": 113351756.0, "step": 960 }, { "epoch": 0.5370061213132999, "grad_norm": 1.9626347976920178, "learning_rate": 1.0475819158237426e-05, "loss": 1.0484, "mean_token_accuracy": 0.716618275642395, "num_tokens": 113929618.0, "step": 965 }, { "epoch": 0.5397885364496383, "grad_norm": 1.9260027594244913, "learning_rate": 1.0378765633396526e-05, "loss": 1.0122, "mean_token_accuracy": 0.7219829797744751, "num_tokens": 114511171.0, "step": 970 }, { "epoch": 0.5425709515859767, "grad_norm": 2.01788882274496, "learning_rate": 1.0281676365895939e-05, "loss": 1.0341, "mean_token_accuracy": 0.7173137307167053, "num_tokens": 115100329.0, "step": 975 }, { "epoch": 0.5453533667223149, "grad_norm": 2.0538865520683554, "learning_rate": 1.0184560517677353e-05, "loss": 1.0588, "mean_token_accuracy": 0.715462589263916, "num_tokens": 115692616.0, "step": 980 }, { "epoch": 0.5481357818586533, "grad_norm": 1.981183937899365, "learning_rate": 1.0087427253190775e-05, "loss": 1.0287, "mean_token_accuracy": 0.7187099099159241, "num_tokens": 116282977.0, "step": 985 }, { "epoch": 0.5509181969949917, "grad_norm": 1.9187211997367468, "learning_rate": 9.990285738529733e-06, "loss": 1.0103, "mean_token_accuracy": 0.7224372506141663, "num_tokens": 116867356.0, "step": 990 }, { "epoch": 0.55370061213133, "grad_norm": 1.876324102948862, "learning_rate": 9.89314514056627e-06, "loss": 0.9724, "mean_token_accuracy": 0.7300009608268738, "num_tokens": 117453088.0, "step": 995 }, { "epoch": 0.5564830272676683, "grad_norm": 1.9623303147959466, "learning_rate": 9.79601462608595e-06, "loss": 1.0035, "mean_token_accuracy": 0.7247561693191529, "num_tokens": 118045362.0, "step": 1000 }, { "epoch": 0.5592654424040067, "grad_norm": 1.8686077447526968, "learning_rate": 9.698903360922773e-06, "loss": 0.9856, "mean_token_accuracy": 0.7283125519752502, "num_tokens": 118637830.0, "step": 1005 }, { "epoch": 0.562047857540345, "grad_norm": 1.8357018834213918, "learning_rate": 9.601820509094272e-06, "loss": 1.0289, "mean_token_accuracy": 0.7204028606414795, "num_tokens": 119229320.0, "step": 1010 }, { "epoch": 0.5648302726766834, "grad_norm": 2.0538909601653286, "learning_rate": 9.504775231936716e-06, "loss": 1.0498, "mean_token_accuracy": 0.7141813278198242, "num_tokens": 119821047.0, "step": 1015 }, { "epoch": 0.5676126878130217, "grad_norm": 1.836766812630116, "learning_rate": 9.407776687240591e-06, "loss": 0.9964, "mean_token_accuracy": 0.7254474043846131, "num_tokens": 120416538.0, "step": 1020 }, { "epoch": 0.57039510294936, "grad_norm": 1.9514390647117241, "learning_rate": 9.310834028386436e-06, "loss": 1.0173, "mean_token_accuracy": 0.7226798415184021, "num_tokens": 121007161.0, "step": 1025 }, { "epoch": 0.5731775180856984, "grad_norm": 1.9399928393210226, "learning_rate": 9.213956403481037e-06, "loss": 1.0142, "mean_token_accuracy": 0.7212600111961365, "num_tokens": 121598725.0, "step": 1030 }, { "epoch": 0.5759599332220368, "grad_norm": 1.9888989034840572, "learning_rate": 9.117152954494195e-06, "loss": 1.0328, "mean_token_accuracy": 0.7186324715614318, "num_tokens": 122188007.0, "step": 1035 }, { "epoch": 0.5787423483583751, "grad_norm": 1.9479652836008858, "learning_rate": 9.020432816395993e-06, "loss": 1.0293, "mean_token_accuracy": 0.7192287445068359, "num_tokens": 122775444.0, "step": 1040 }, { "epoch": 0.5815247634947134, "grad_norm": 1.868873876658722, "learning_rate": 8.92380511629481e-06, "loss": 1.0088, "mean_token_accuracy": 0.7221626877784729, "num_tokens": 123364954.0, "step": 1045 }, { "epoch": 0.5843071786310517, "grad_norm": 1.8460354821562948, "learning_rate": 8.827278972575984e-06, "loss": 1.0034, "mean_token_accuracy": 0.7228006601333619, "num_tokens": 123959095.0, "step": 1050 }, { "epoch": 0.5870895937673901, "grad_norm": 2.027141329127932, "learning_rate": 8.730863494041379e-06, "loss": 1.0222, "mean_token_accuracy": 0.7207066774368286, "num_tokens": 124551375.0, "step": 1055 }, { "epoch": 0.5898720089037285, "grad_norm": 1.9303209158495678, "learning_rate": 8.634567779049807e-06, "loss": 1.0136, "mean_token_accuracy": 0.7226389169692993, "num_tokens": 125148583.0, "step": 1060 }, { "epoch": 0.5926544240400667, "grad_norm": 1.821089900718507, "learning_rate": 8.538400914658456e-06, "loss": 1.0157, "mean_token_accuracy": 0.7223539471626281, "num_tokens": 125739350.0, "step": 1065 }, { "epoch": 0.5954368391764051, "grad_norm": 1.9402132874471187, "learning_rate": 8.442371975765368e-06, "loss": 1.0255, "mean_token_accuracy": 0.7197723150253296, "num_tokens": 126327360.0, "step": 1070 }, { "epoch": 0.5982192543127435, "grad_norm": 4.186389331333341, "learning_rate": 8.346490024253103e-06, "loss": 0.9985, "mean_token_accuracy": 0.7242988467216491, "num_tokens": 126919726.0, "step": 1075 }, { "epoch": 0.6010016694490818, "grad_norm": 1.8722288044640503, "learning_rate": 8.250764108133562e-06, "loss": 1.018, "mean_token_accuracy": 0.720951783657074, "num_tokens": 127503663.0, "step": 1080 }, { "epoch": 0.6037840845854201, "grad_norm": 1.9837555239288165, "learning_rate": 8.15520326069421e-06, "loss": 1.0133, "mean_token_accuracy": 0.7221065282821655, "num_tokens": 128092295.0, "step": 1085 }, { "epoch": 0.6065664997217585, "grad_norm": 1.918332101066207, "learning_rate": 8.05981649964559e-06, "loss": 1.0336, "mean_token_accuracy": 0.7192610502243042, "num_tokens": 128681634.0, "step": 1090 }, { "epoch": 0.6093489148580968, "grad_norm": 1.9154193774615031, "learning_rate": 7.964612826270399e-06, "loss": 0.9984, "mean_token_accuracy": 0.7253150701522827, "num_tokens": 129276945.0, "step": 1095 }, { "epoch": 0.6121313299944352, "grad_norm": 1.9833171612274754, "learning_rate": 7.86960122457404e-06, "loss": 1.0098, "mean_token_accuracy": 0.7225422620773315, "num_tokens": 129870803.0, "step": 1100 }, { "epoch": 0.6149137451307735, "grad_norm": 1.8771784670033254, "learning_rate": 7.774790660436857e-06, "loss": 1.0001, "mean_token_accuracy": 0.7242280602455139, "num_tokens": 130461085.0, "step": 1105 }, { "epoch": 0.6176961602671118, "grad_norm": 1.98853932669659, "learning_rate": 7.680190080768046e-06, "loss": 1.006, "mean_token_accuracy": 0.7234596967697143, "num_tokens": 131044253.0, "step": 1110 }, { "epoch": 0.6204785754034502, "grad_norm": 2.4167589390741946, "learning_rate": 7.585808412661379e-06, "loss": 1.0199, "mean_token_accuracy": 0.7218466520309448, "num_tokens": 131634142.0, "step": 1115 }, { "epoch": 0.6232609905397886, "grad_norm": 1.9611913130221166, "learning_rate": 7.4916545625527745e-06, "loss": 1.011, "mean_token_accuracy": 0.7240106225013733, "num_tokens": 132217401.0, "step": 1120 }, { "epoch": 0.6260434056761269, "grad_norm": 1.8664547397127202, "learning_rate": 7.397737415379853e-06, "loss": 1.0042, "mean_token_accuracy": 0.7248145937919617, "num_tokens": 132804730.0, "step": 1125 }, { "epoch": 0.6288258208124652, "grad_norm": 1.8854684297024977, "learning_rate": 7.304065833743475e-06, "loss": 1.0112, "mean_token_accuracy": 0.7220677971839905, "num_tokens": 133395672.0, "step": 1130 }, { "epoch": 0.6316082359488036, "grad_norm": 1.986610522875934, "learning_rate": 7.210648657071433e-06, "loss": 1.0152, "mean_token_accuracy": 0.7226180791854858, "num_tokens": 133987957.0, "step": 1135 }, { "epoch": 0.6343906510851419, "grad_norm": 1.889771442380467, "learning_rate": 7.117494700784292e-06, "loss": 0.9915, "mean_token_accuracy": 0.7284766793251037, "num_tokens": 134580628.0, "step": 1140 }, { "epoch": 0.6371730662214803, "grad_norm": 1.795583208948344, "learning_rate": 7.024612755463529e-06, "loss": 1.0106, "mean_token_accuracy": 0.7225217223167419, "num_tokens": 135175991.0, "step": 1145 }, { "epoch": 0.6399554813578185, "grad_norm": 1.9767456426052468, "learning_rate": 6.9320115860219705e-06, "loss": 1.005, "mean_token_accuracy": 0.724352490901947, "num_tokens": 135760748.0, "step": 1150 }, { "epoch": 0.6427378964941569, "grad_norm": 1.9522889123008382, "learning_rate": 6.839699930876727e-06, "loss": 1.0128, "mean_token_accuracy": 0.7235522747039795, "num_tokens": 136348202.0, "step": 1155 }, { "epoch": 0.6455203116304953, "grad_norm": 1.8485871557042115, "learning_rate": 6.747686501124531e-06, "loss": 1.0202, "mean_token_accuracy": 0.7193972945213318, "num_tokens": 136939858.0, "step": 1160 }, { "epoch": 0.6483027267668336, "grad_norm": 1.784737267536378, "learning_rate": 6.655979979719744e-06, "loss": 0.9938, "mean_token_accuracy": 0.7244254350662231, "num_tokens": 137528803.0, "step": 1165 }, { "epoch": 0.6510851419031719, "grad_norm": 1.8881790480821496, "learning_rate": 6.5645890206549566e-06, "loss": 0.974, "mean_token_accuracy": 0.7322948575019836, "num_tokens": 138114576.0, "step": 1170 }, { "epoch": 0.6538675570395103, "grad_norm": 1.941045031850279, "learning_rate": 6.473522248144359e-06, "loss": 0.9798, "mean_token_accuracy": 0.7303597807884217, "num_tokens": 138701429.0, "step": 1175 }, { "epoch": 0.6566499721758486, "grad_norm": 1.9164315660360134, "learning_rate": 6.382788255809893e-06, "loss": 1.0005, "mean_token_accuracy": 0.7247542023658753, "num_tokens": 139296441.0, "step": 1180 }, { "epoch": 0.659432387312187, "grad_norm": 1.9252747772897012, "learning_rate": 6.292395605870314e-06, "loss": 0.9935, "mean_token_accuracy": 0.727207088470459, "num_tokens": 139884765.0, "step": 1185 }, { "epoch": 0.6622148024485254, "grad_norm": 2.48708952288793, "learning_rate": 6.202352828333211e-06, "loss": 0.997, "mean_token_accuracy": 0.7260267257690429, "num_tokens": 140474793.0, "step": 1190 }, { "epoch": 0.6649972175848636, "grad_norm": 1.8703391327630177, "learning_rate": 6.112668420190042e-06, "loss": 0.9826, "mean_token_accuracy": 0.7283554911613465, "num_tokens": 141064904.0, "step": 1195 }, { "epoch": 0.667779632721202, "grad_norm": 1.9501052152673526, "learning_rate": 6.023350844614344e-06, "loss": 0.9763, "mean_token_accuracy": 0.7310232162475586, "num_tokens": 141649410.0, "step": 1200 }, { "epoch": 0.6705620478575404, "grad_norm": 1.9782655835974774, "learning_rate": 5.9344085301630425e-06, "loss": 1.003, "mean_token_accuracy": 0.723707640171051, "num_tokens": 142239282.0, "step": 1205 }, { "epoch": 0.6733444629938787, "grad_norm": 1.9354136592872768, "learning_rate": 5.845849869981137e-06, "loss": 1.0027, "mean_token_accuracy": 0.7262308835983277, "num_tokens": 142827280.0, "step": 1210 }, { "epoch": 0.676126878130217, "grad_norm": 1.8791345693920576, "learning_rate": 5.757683221009625e-06, "loss": 0.9975, "mean_token_accuracy": 0.7248466491699219, "num_tokens": 143422429.0, "step": 1215 }, { "epoch": 0.6789092932665554, "grad_norm": 1.8963640356605256, "learning_rate": 5.669916903196931e-06, "loss": 1.0014, "mean_token_accuracy": 0.7251011848449707, "num_tokens": 144009015.0, "step": 1220 }, { "epoch": 0.6816917084028937, "grad_norm": 1.7618216747620736, "learning_rate": 5.58255919871374e-06, "loss": 0.9848, "mean_token_accuracy": 0.7293275952339172, "num_tokens": 144602634.0, "step": 1225 }, { "epoch": 0.6844741235392321, "grad_norm": 1.8457721051471847, "learning_rate": 5.495618351171484e-06, "loss": 0.9919, "mean_token_accuracy": 0.7272073984146118, "num_tokens": 145196052.0, "step": 1230 }, { "epoch": 0.6872565386755703, "grad_norm": 1.9450479369664486, "learning_rate": 5.409102564844393e-06, "loss": 0.9938, "mean_token_accuracy": 0.7261118292808533, "num_tokens": 145794135.0, "step": 1235 }, { "epoch": 0.6900389538119087, "grad_norm": 1.8511319642398822, "learning_rate": 5.323020003895307e-06, "loss": 0.9484, "mean_token_accuracy": 0.7359282970428467, "num_tokens": 146384348.0, "step": 1240 }, { "epoch": 0.6928213689482471, "grad_norm": 1.7695297382973636, "learning_rate": 5.237378791605249e-06, "loss": 0.9638, "mean_token_accuracy": 0.7326830267906189, "num_tokens": 146981119.0, "step": 1245 }, { "epoch": 0.6956037840845855, "grad_norm": 1.9109931194680208, "learning_rate": 5.152187009606864e-06, "loss": 0.9878, "mean_token_accuracy": 0.7266369104385376, "num_tokens": 147573298.0, "step": 1250 }, { "epoch": 0.6983861992209237, "grad_norm": 1.8838124813014612, "learning_rate": 5.067452697121773e-06, "loss": 1.0136, "mean_token_accuracy": 0.7227142214775085, "num_tokens": 148166017.0, "step": 1255 }, { "epoch": 0.7011686143572621, "grad_norm": 1.8764858816220478, "learning_rate": 4.98318385020197e-06, "loss": 0.991, "mean_token_accuracy": 0.7256438136100769, "num_tokens": 148758203.0, "step": 1260 }, { "epoch": 0.7039510294936004, "grad_norm": 1.7836760728163532, "learning_rate": 4.8993884209752364e-06, "loss": 0.9776, "mean_token_accuracy": 0.728616988658905, "num_tokens": 149343772.0, "step": 1265 }, { "epoch": 0.7067334446299388, "grad_norm": 1.859071558512736, "learning_rate": 4.81607431689475e-06, "loss": 0.9859, "mean_token_accuracy": 0.7275610089302063, "num_tokens": 149934534.0, "step": 1270 }, { "epoch": 0.7095158597662772, "grad_norm": 1.830584410956007, "learning_rate": 4.7332493999928785e-06, "loss": 0.9997, "mean_token_accuracy": 0.7258034944534302, "num_tokens": 150528868.0, "step": 1275 }, { "epoch": 0.7122982749026154, "grad_norm": 1.7872234444186852, "learning_rate": 4.6509214861392785e-06, "loss": 0.9904, "mean_token_accuracy": 0.7283051371574402, "num_tokens": 151128370.0, "step": 1280 }, { "epoch": 0.7150806900389538, "grad_norm": 1.8707290023032752, "learning_rate": 4.569098344303319e-06, "loss": 0.9715, "mean_token_accuracy": 0.7312512874603272, "num_tokens": 151722014.0, "step": 1285 }, { "epoch": 0.7178631051752922, "grad_norm": 1.847939158663493, "learning_rate": 4.487787695820991e-06, "loss": 0.973, "mean_token_accuracy": 0.7308701038360595, "num_tokens": 152312667.0, "step": 1290 }, { "epoch": 0.7206455203116305, "grad_norm": 1.7516029476618973, "learning_rate": 4.406997213666236e-06, "loss": 0.9661, "mean_token_accuracy": 0.731387734413147, "num_tokens": 152899175.0, "step": 1295 }, { "epoch": 0.7234279354479688, "grad_norm": 1.8151365859715758, "learning_rate": 4.326734521726905e-06, "loss": 0.9563, "mean_token_accuracy": 0.7346587657928467, "num_tokens": 153488259.0, "step": 1300 }, { "epoch": 0.7262103505843072, "grad_norm": 1.7688011856056145, "learning_rate": 4.24700719408531e-06, "loss": 0.975, "mean_token_accuracy": 0.7301976919174195, "num_tokens": 154076796.0, "step": 1305 }, { "epoch": 0.7289927657206455, "grad_norm": 1.9580888383075816, "learning_rate": 4.167822754303493e-06, "loss": 0.9738, "mean_token_accuracy": 0.7310252785682678, "num_tokens": 154664728.0, "step": 1310 }, { "epoch": 0.7317751808569839, "grad_norm": 1.999126181586082, "learning_rate": 4.0891886747132356e-06, "loss": 0.9824, "mean_token_accuracy": 0.7299495816230774, "num_tokens": 155254919.0, "step": 1315 }, { "epoch": 0.7345575959933222, "grad_norm": 1.921167819518032, "learning_rate": 4.011112375710958e-06, "loss": 1.0045, "mean_token_accuracy": 0.7263089060783386, "num_tokens": 155842686.0, "step": 1320 }, { "epoch": 0.7373400111296605, "grad_norm": 1.801354649641667, "learning_rate": 3.933601225057446e-06, "loss": 0.9541, "mean_token_accuracy": 0.7346353769302368, "num_tokens": 156428772.0, "step": 1325 }, { "epoch": 0.7401224262659989, "grad_norm": 1.8122280326962623, "learning_rate": 3.85666253718263e-06, "loss": 0.9565, "mean_token_accuracy": 0.7328558325767517, "num_tokens": 157015422.0, "step": 1330 }, { "epoch": 0.7429048414023373, "grad_norm": 1.839585045567707, "learning_rate": 3.7803035724953007e-06, "loss": 0.9652, "mean_token_accuracy": 0.7333778142929077, "num_tokens": 157603116.0, "step": 1335 }, { "epoch": 0.7456872565386756, "grad_norm": 1.8056794531482636, "learning_rate": 3.704531536698012e-06, "loss": 0.9576, "mean_token_accuracy": 0.7345310568809509, "num_tokens": 158186881.0, "step": 1340 }, { "epoch": 0.7484696716750139, "grad_norm": 1.9143591141126923, "learning_rate": 3.6293535801070735e-06, "loss": 0.9709, "mean_token_accuracy": 0.7322964310646057, "num_tokens": 158782774.0, "step": 1345 }, { "epoch": 0.7512520868113522, "grad_norm": 1.777185833225788, "learning_rate": 3.5547767969778355e-06, "loss": 0.9892, "mean_token_accuracy": 0.7279403567314148, "num_tokens": 159372802.0, "step": 1350 }, { "epoch": 0.7540345019476906, "grad_norm": 1.9535914643310681, "learning_rate": 3.4808082248352058e-06, "loss": 0.9802, "mean_token_accuracy": 0.7304705739021301, "num_tokens": 159960156.0, "step": 1355 }, { "epoch": 0.756816917084029, "grad_norm": 1.8788792436096216, "learning_rate": 3.40745484380956e-06, "loss": 0.9821, "mean_token_accuracy": 0.7290140271186829, "num_tokens": 160545408.0, "step": 1360 }, { "epoch": 0.7595993322203672, "grad_norm": 1.927958774997016, "learning_rate": 3.3347235759780483e-06, "loss": 0.9752, "mean_token_accuracy": 0.731472396850586, "num_tokens": 161134387.0, "step": 1365 }, { "epoch": 0.7623817473567056, "grad_norm": 1.916500311041559, "learning_rate": 3.262621284711376e-06, "loss": 0.9846, "mean_token_accuracy": 0.729660439491272, "num_tokens": 161724072.0, "step": 1370 }, { "epoch": 0.765164162493044, "grad_norm": 1.7923444423002466, "learning_rate": 3.191154774026156e-06, "loss": 0.9655, "mean_token_accuracy": 0.7318884611129761, "num_tokens": 162310073.0, "step": 1375 }, { "epoch": 0.7679465776293823, "grad_norm": 1.7760349668951476, "learning_rate": 3.1203307879428146e-06, "loss": 0.9522, "mean_token_accuracy": 0.7352138042449952, "num_tokens": 162900817.0, "step": 1380 }, { "epoch": 0.7707289927657206, "grad_norm": 1.9573185565221094, "learning_rate": 3.0501560098492056e-06, "loss": 0.9476, "mean_token_accuracy": 0.7361976623535156, "num_tokens": 163488071.0, "step": 1385 }, { "epoch": 0.773511407902059, "grad_norm": 1.8472978388798753, "learning_rate": 2.9806370618699142e-06, "loss": 0.9599, "mean_token_accuracy": 0.7325201988220215, "num_tokens": 164076411.0, "step": 1390 }, { "epoch": 0.7762938230383973, "grad_norm": 1.9039958120991338, "learning_rate": 2.911780504241354e-06, "loss": 0.955, "mean_token_accuracy": 0.7342100620269776, "num_tokens": 164665515.0, "step": 1395 }, { "epoch": 0.7790762381747357, "grad_norm": 1.8467129585219402, "learning_rate": 2.8435928346926945e-06, "loss": 0.959, "mean_token_accuracy": 0.7346114397048951, "num_tokens": 165255953.0, "step": 1400 }, { "epoch": 0.781858653311074, "grad_norm": 2.0658334017918474, "learning_rate": 2.776080487832715e-06, "loss": 0.961, "mean_token_accuracy": 0.7332920074462891, "num_tokens": 165838622.0, "step": 1405 }, { "epoch": 0.7846410684474123, "grad_norm": 1.8352945141096326, "learning_rate": 2.70924983454257e-06, "loss": 0.9963, "mean_token_accuracy": 0.7269340515136719, "num_tokens": 166431603.0, "step": 1410 }, { "epoch": 0.7874234835837507, "grad_norm": 1.7741661715418184, "learning_rate": 2.6431071813746277e-06, "loss": 0.9548, "mean_token_accuracy": 0.7333246469497681, "num_tokens": 167020566.0, "step": 1415 }, { "epoch": 0.7902058987200891, "grad_norm": 1.7577119163466035, "learning_rate": 2.5776587699573007e-06, "loss": 0.9557, "mean_token_accuracy": 0.7359763622283936, "num_tokens": 167611779.0, "step": 1420 }, { "epoch": 0.7929883138564274, "grad_norm": 1.8883403863945059, "learning_rate": 2.512910776406089e-06, "loss": 0.9714, "mean_token_accuracy": 0.7312511920928955, "num_tokens": 168191251.0, "step": 1425 }, { "epoch": 0.7957707289927657, "grad_norm": 1.8710738581171973, "learning_rate": 2.4488693107407335e-06, "loss": 0.9731, "mean_token_accuracy": 0.7300806879997254, "num_tokens": 168782303.0, "step": 1430 }, { "epoch": 0.798553144129104, "grad_norm": 1.810175185890339, "learning_rate": 2.3855404163086558e-06, "loss": 0.9595, "mean_token_accuracy": 0.7339372992515564, "num_tokens": 169372790.0, "step": 1435 }, { "epoch": 0.8013355592654424, "grad_norm": 1.8716987813342199, "learning_rate": 2.322930069214664e-06, "loss": 0.9422, "mean_token_accuracy": 0.7376594424247742, "num_tokens": 169958372.0, "step": 1440 }, { "epoch": 0.8041179744017808, "grad_norm": 1.8055779362732807, "learning_rate": 2.2610441777570104e-06, "loss": 0.9713, "mean_token_accuracy": 0.7313427925109863, "num_tokens": 170547568.0, "step": 1445 }, { "epoch": 0.806900389538119, "grad_norm": 1.8191938807882237, "learning_rate": 2.1998885818698434e-06, "loss": 0.9395, "mean_token_accuracy": 0.7381924271583558, "num_tokens": 171132579.0, "step": 1450 }, { "epoch": 0.8096828046744574, "grad_norm": 1.889036188183445, "learning_rate": 2.1394690525721275e-06, "loss": 0.9744, "mean_token_accuracy": 0.7313727378845215, "num_tokens": 171722385.0, "step": 1455 }, { "epoch": 0.8124652198107958, "grad_norm": 1.8701292804037986, "learning_rate": 2.079791291423039e-06, "loss": 0.9786, "mean_token_accuracy": 0.729002046585083, "num_tokens": 172315922.0, "step": 1460 }, { "epoch": 0.8152476349471341, "grad_norm": 1.8760160382405797, "learning_rate": 2.0208609299839465e-06, "loss": 0.9683, "mean_token_accuracy": 0.7306602478027344, "num_tokens": 172910228.0, "step": 1465 }, { "epoch": 0.8180300500834724, "grad_norm": 2.129314039319898, "learning_rate": 1.962683529286973e-06, "loss": 0.9634, "mean_token_accuracy": 0.7342103958129883, "num_tokens": 173492796.0, "step": 1470 }, { "epoch": 0.8208124652198108, "grad_norm": 1.786710036467001, "learning_rate": 1.9052645793102277e-06, "loss": 0.9646, "mean_token_accuracy": 0.7335922002792359, "num_tokens": 174076770.0, "step": 1475 }, { "epoch": 0.8235948803561491, "grad_norm": 1.895632789397937, "learning_rate": 1.8486094984597268e-06, "loss": 1.0103, "mean_token_accuracy": 0.723214328289032, "num_tokens": 174666564.0, "step": 1480 }, { "epoch": 0.8263772954924875, "grad_norm": 1.8125039443565851, "learning_rate": 1.7927236330581e-06, "loss": 0.9504, "mean_token_accuracy": 0.7362190008163452, "num_tokens": 175250981.0, "step": 1485 }, { "epoch": 0.8291597106288259, "grad_norm": 1.722786992549355, "learning_rate": 1.7376122568400533e-06, "loss": 0.9499, "mean_token_accuracy": 0.7359644174575806, "num_tokens": 175846510.0, "step": 1490 }, { "epoch": 0.8319421257651641, "grad_norm": 1.841603530171618, "learning_rate": 1.6832805704547272e-06, "loss": 0.9551, "mean_token_accuracy": 0.7342963933944702, "num_tokens": 176432352.0, "step": 1495 }, { "epoch": 0.8347245409015025, "grad_norm": 1.9002675861540066, "learning_rate": 1.6297337009749249e-06, "loss": 0.9446, "mean_token_accuracy": 0.7374125838279724, "num_tokens": 177024825.0, "step": 1500 }, { "epoch": 0.8375069560378409, "grad_norm": 1.809765975130373, "learning_rate": 1.5769767014132885e-06, "loss": 0.9544, "mean_token_accuracy": 0.7355196237564087, "num_tokens": 177612725.0, "step": 1505 }, { "epoch": 0.8402893711741792, "grad_norm": 1.75809727968389, "learning_rate": 1.5250145502454594e-06, "loss": 0.9548, "mean_token_accuracy": 0.7356468796730041, "num_tokens": 178207999.0, "step": 1510 }, { "epoch": 0.8430717863105175, "grad_norm": 1.9477117802608452, "learning_rate": 1.473852150940297e-06, "loss": 0.9501, "mean_token_accuracy": 0.7353867173194886, "num_tokens": 178792546.0, "step": 1515 }, { "epoch": 0.8458542014468559, "grad_norm": 1.8675907748201204, "learning_rate": 1.4234943314971328e-06, "loss": 0.9472, "mean_token_accuracy": 0.7378309011459351, "num_tokens": 179380874.0, "step": 1520 }, { "epoch": 0.8486366165831942, "grad_norm": 1.9839427192393002, "learning_rate": 1.373945843990192e-06, "loss": 0.9686, "mean_token_accuracy": 0.7325679302215576, "num_tokens": 179970205.0, "step": 1525 }, { "epoch": 0.8514190317195326, "grad_norm": 1.8714168212459494, "learning_rate": 1.3252113641201537e-06, "loss": 0.9532, "mean_token_accuracy": 0.7361051917076111, "num_tokens": 180566757.0, "step": 1530 }, { "epoch": 0.8542014468558708, "grad_norm": 2.088368062803997, "learning_rate": 1.2772954907729074e-06, "loss": 0.9185, "mean_token_accuracy": 0.7416197896003723, "num_tokens": 181156035.0, "step": 1535 }, { "epoch": 0.8569838619922092, "grad_norm": 1.9320065667167445, "learning_rate": 1.2302027455855969e-06, "loss": 0.9452, "mean_token_accuracy": 0.736557149887085, "num_tokens": 181740790.0, "step": 1540 }, { "epoch": 0.8597662771285476, "grad_norm": 1.8430967747183953, "learning_rate": 1.1839375725199098e-06, "loss": 0.9541, "mean_token_accuracy": 0.7358271360397339, "num_tokens": 182328713.0, "step": 1545 }, { "epoch": 0.862548692264886, "grad_norm": 1.9004590415789702, "learning_rate": 1.1385043374427341e-06, "loss": 0.9663, "mean_token_accuracy": 0.731933867931366, "num_tokens": 182919104.0, "step": 1550 }, { "epoch": 0.8653311074012242, "grad_norm": 1.738836286774547, "learning_rate": 1.0939073277141598e-06, "loss": 0.9462, "mean_token_accuracy": 0.737112843990326, "num_tokens": 183507931.0, "step": 1555 }, { "epoch": 0.8681135225375626, "grad_norm": 1.929598189537243, "learning_rate": 1.0501507517829012e-06, "loss": 0.9662, "mean_token_accuracy": 0.7332514524459839, "num_tokens": 184093155.0, "step": 1560 }, { "epoch": 0.8708959376739009, "grad_norm": 1.8032404204598453, "learning_rate": 1.0072387387891535e-06, "loss": 0.941, "mean_token_accuracy": 0.7367923140525818, "num_tokens": 184680915.0, "step": 1565 }, { "epoch": 0.8736783528102393, "grad_norm": 1.8432734129518489, "learning_rate": 9.65175338174954e-07, "loss": 0.9615, "mean_token_accuracy": 0.7350999474525451, "num_tokens": 185272303.0, "step": 1570 }, { "epoch": 0.8764607679465777, "grad_norm": 1.9082038654252018, "learning_rate": 9.239645193020386e-07, "loss": 0.969, "mean_token_accuracy": 0.7324134349822998, "num_tokens": 185865698.0, "step": 1575 }, { "epoch": 0.8792431830829159, "grad_norm": 1.8191308709884229, "learning_rate": 8.836101710772826e-07, "loss": 0.9429, "mean_token_accuracy": 0.7369024634361268, "num_tokens": 186455776.0, "step": 1580 }, { "epoch": 0.8820255982192543, "grad_norm": 1.8602027757002002, "learning_rate": 8.441161015857092e-07, "loss": 0.9621, "mean_token_accuracy": 0.7330835700035095, "num_tokens": 187049436.0, "step": 1585 }, { "epoch": 0.8848080133555927, "grad_norm": 1.8629150091039206, "learning_rate": 8.054860377311368e-07, "loss": 0.9632, "mean_token_accuracy": 0.7352335929870606, "num_tokens": 187643221.0, "step": 1590 }, { "epoch": 0.887590428491931, "grad_norm": 1.7199187404593725, "learning_rate": 7.677236248844855e-07, "loss": 0.9208, "mean_token_accuracy": 0.7412317156791687, "num_tokens": 188241695.0, "step": 1595 }, { "epoch": 0.8903728436282693, "grad_norm": 1.9691877660185322, "learning_rate": 7.308324265397837e-07, "loss": 0.9454, "mean_token_accuracy": 0.7370688557624817, "num_tokens": 188836037.0, "step": 1600 }, { "epoch": 0.8931552587646077, "grad_norm": 1.9408377445794653, "learning_rate": 6.948159239778829e-07, "loss": 0.9529, "mean_token_accuracy": 0.7338770508766175, "num_tokens": 189432028.0, "step": 1605 }, { "epoch": 0.895937673900946, "grad_norm": 1.7165506466090106, "learning_rate": 6.596775159379543e-07, "loss": 0.9539, "mean_token_accuracy": 0.7329376816749573, "num_tokens": 190027436.0, "step": 1610 }, { "epoch": 0.8987200890372844, "grad_norm": 1.8489886082758764, "learning_rate": 6.254205182967566e-07, "loss": 0.9827, "mean_token_accuracy": 0.7286684274673462, "num_tokens": 190619847.0, "step": 1615 }, { "epoch": 0.9015025041736227, "grad_norm": 1.8671715601161172, "learning_rate": 5.920481637557318e-07, "loss": 0.9519, "mean_token_accuracy": 0.7349419355392456, "num_tokens": 191212681.0, "step": 1620 }, { "epoch": 0.904284919309961, "grad_norm": 1.9345594767714265, "learning_rate": 5.59563601535943e-07, "loss": 0.9273, "mean_token_accuracy": 0.7412004351615906, "num_tokens": 191805794.0, "step": 1625 }, { "epoch": 0.9070673344462994, "grad_norm": 1.8551965637564125, "learning_rate": 5.279698970809011e-07, "loss": 0.9414, "mean_token_accuracy": 0.7379236817359924, "num_tokens": 192402500.0, "step": 1630 }, { "epoch": 0.9098497495826378, "grad_norm": 1.7354354682366766, "learning_rate": 4.972700317672829e-07, "loss": 0.9497, "mean_token_accuracy": 0.7355363965034485, "num_tokens": 192989888.0, "step": 1635 }, { "epoch": 0.9126321647189761, "grad_norm": 1.853770597272773, "learning_rate": 4.674669026236045e-07, "loss": 0.9457, "mean_token_accuracy": 0.7373492479324341, "num_tokens": 193579779.0, "step": 1640 }, { "epoch": 0.9154145798553144, "grad_norm": 1.8911816701117188, "learning_rate": 4.385633220568186e-07, "loss": 0.9575, "mean_token_accuracy": 0.7329700469970704, "num_tokens": 194172425.0, "step": 1645 }, { "epoch": 0.9181969949916527, "grad_norm": 1.810081275671363, "learning_rate": 4.1056201758693957e-07, "loss": 0.9497, "mean_token_accuracy": 0.7358971953392028, "num_tokens": 194760384.0, "step": 1650 }, { "epoch": 0.9209794101279911, "grad_norm": 1.8068796359631913, "learning_rate": 3.834656315896379e-07, "loss": 0.9349, "mean_token_accuracy": 0.7383142828941345, "num_tokens": 195354279.0, "step": 1655 }, { "epoch": 0.9237618252643295, "grad_norm": 1.7877479581883122, "learning_rate": 3.572767210469086e-07, "loss": 0.9418, "mean_token_accuracy": 0.7375067234039306, "num_tokens": 195940416.0, "step": 1660 }, { "epoch": 0.9265442404006677, "grad_norm": 1.8061020929645715, "learning_rate": 3.319977573057642e-07, "loss": 0.9361, "mean_token_accuracy": 0.7379802227020263, "num_tokens": 196528933.0, "step": 1665 }, { "epoch": 0.9293266555370061, "grad_norm": 1.7940687494892404, "learning_rate": 3.0763112584503264e-07, "loss": 0.9441, "mean_token_accuracy": 0.7372017502784729, "num_tokens": 197115460.0, "step": 1670 }, { "epoch": 0.9321090706733445, "grad_norm": 1.7726872041826656, "learning_rate": 2.841791260502402e-07, "loss": 0.9618, "mean_token_accuracy": 0.7329637885093689, "num_tokens": 197705685.0, "step": 1675 }, { "epoch": 0.9348914858096828, "grad_norm": 2.1180868450006214, "learning_rate": 2.6164397099663676e-07, "loss": 0.9656, "mean_token_accuracy": 0.7338227391242981, "num_tokens": 198291195.0, "step": 1680 }, { "epoch": 0.9376739009460211, "grad_norm": 1.875515474041982, "learning_rate": 2.4002778724034447e-07, "loss": 0.9543, "mean_token_accuracy": 0.7351009726524353, "num_tokens": 198877612.0, "step": 1685 }, { "epoch": 0.9404563160823595, "grad_norm": 1.8136225504642058, "learning_rate": 2.1933261461769772e-07, "loss": 0.9181, "mean_token_accuracy": 0.7414175033569336, "num_tokens": 199453234.0, "step": 1690 }, { "epoch": 0.9432387312186978, "grad_norm": 1.7835014898755626, "learning_rate": 1.9956040605273784e-07, "loss": 0.9749, "mean_token_accuracy": 0.730805778503418, "num_tokens": 200046406.0, "step": 1695 }, { "epoch": 0.9460211463550362, "grad_norm": 1.7517370910691499, "learning_rate": 1.8071302737293294e-07, "loss": 0.9323, "mean_token_accuracy": 0.7374993085861206, "num_tokens": 200635906.0, "step": 1700 }, { "epoch": 0.9488035614913745, "grad_norm": 1.8527935092393206, "learning_rate": 1.6279225713310088e-07, "loss": 0.9295, "mean_token_accuracy": 0.7385197877883911, "num_tokens": 201229877.0, "step": 1705 }, { "epoch": 0.9515859766277128, "grad_norm": 1.8136557164407798, "learning_rate": 1.4579978644757463e-07, "loss": 0.9471, "mean_token_accuracy": 0.7368571162223816, "num_tokens": 201819996.0, "step": 1710 }, { "epoch": 0.9543683917640512, "grad_norm": 1.9202219251983068, "learning_rate": 1.297372188306234e-07, "loss": 0.9695, "mean_token_accuracy": 0.7313903212547302, "num_tokens": 202413015.0, "step": 1715 }, { "epoch": 0.9571508069003896, "grad_norm": 1.9683140510809376, "learning_rate": 1.1460607004512681e-07, "loss": 0.9575, "mean_token_accuracy": 0.734274709224701, "num_tokens": 203004551.0, "step": 1720 }, { "epoch": 0.9599332220367279, "grad_norm": 1.9014384654161847, "learning_rate": 1.004077679595472e-07, "loss": 0.9535, "mean_token_accuracy": 0.7355141043663025, "num_tokens": 203598341.0, "step": 1725 }, { "epoch": 0.9627156371730662, "grad_norm": 1.9286046696281232, "learning_rate": 8.714365241318079e-08, "loss": 0.9554, "mean_token_accuracy": 0.7344950199127197, "num_tokens": 204184977.0, "step": 1730 }, { "epoch": 0.9654980523094046, "grad_norm": 1.848637459602703, "learning_rate": 7.481497508972313e-08, "loss": 0.9495, "mean_token_accuracy": 0.7371500611305237, "num_tokens": 204771217.0, "step": 1735 }, { "epoch": 0.9682804674457429, "grad_norm": 1.8523905866159716, "learning_rate": 6.342289939915369e-08, "loss": 0.9586, "mean_token_accuracy": 0.7342531204223632, "num_tokens": 205355942.0, "step": 1740 }, { "epoch": 0.9710628825820813, "grad_norm": 1.7373507952790055, "learning_rate": 5.2968500367951425e-08, "loss": 0.9239, "mean_token_accuracy": 0.7412109613418579, "num_tokens": 205948315.0, "step": 1745 }, { "epoch": 0.9738452977184195, "grad_norm": 1.7596019981079423, "learning_rate": 4.345276453764258e-08, "loss": 0.9212, "mean_token_accuracy": 0.7406978607177734, "num_tokens": 206543533.0, "step": 1750 }, { "epoch": 0.9766277128547579, "grad_norm": 1.9410970916478685, "learning_rate": 3.487658987171294e-08, "loss": 0.9673, "mean_token_accuracy": 0.732921814918518, "num_tokens": 207131317.0, "step": 1755 }, { "epoch": 0.9794101279910963, "grad_norm": 1.8954966093705055, "learning_rate": 2.724078567086119e-08, "loss": 0.961, "mean_token_accuracy": 0.733088493347168, "num_tokens": 207715056.0, "step": 1760 }, { "epoch": 0.9821925431274346, "grad_norm": 1.9018642175991762, "learning_rate": 2.054607249663665e-08, "loss": 0.9794, "mean_token_accuracy": 0.7320362687110901, "num_tokens": 208303816.0, "step": 1765 }, { "epoch": 0.9849749582637729, "grad_norm": 1.7650786536070042, "learning_rate": 1.4793082103435885e-08, "loss": 0.9314, "mean_token_accuracy": 0.739205515384674, "num_tokens": 208893950.0, "step": 1770 }, { "epoch": 0.9877573734001113, "grad_norm": 1.7266587239672113, "learning_rate": 9.982357378891528e-09, "loss": 0.9607, "mean_token_accuracy": 0.7332687497138977, "num_tokens": 209482604.0, "step": 1775 }, { "epoch": 0.9905397885364496, "grad_norm": 1.8691016007630499, "learning_rate": 6.114352292639902e-09, "loss": 0.9559, "mean_token_accuracy": 0.734725546836853, "num_tokens": 210067242.0, "step": 1780 }, { "epoch": 0.993322203672788, "grad_norm": 1.6438880148449324, "learning_rate": 3.1894318534819725e-09, "loss": 0.9707, "mean_token_accuracy": 0.7310842633247375, "num_tokens": 210660656.0, "step": 1785 }, { "epoch": 0.9961046188091264, "grad_norm": 1.9330784206590557, "learning_rate": 1.2078720749364447e-09, "loss": 0.9491, "mean_token_accuracy": 0.7360571622848511, "num_tokens": 211243204.0, "step": 1790 }, { "epoch": 0.9988870339454646, "grad_norm": 1.8314428771696998, "learning_rate": 1.69859949198381e-10, "loss": 0.9316, "mean_token_accuracy": 0.7387963652610778, "num_tokens": 211840329.0, "step": 1795 }, { "epoch": 1.0, "eval_loss": 0.9461386799812317, "eval_mean_token_accuracy": 0.7361341528594494, "eval_num_tokens": 212075689.0, "eval_runtime": 4.8216, "eval_samples_per_second": 207.398, "eval_steps_per_second": 3.318, "step": 1797 }, { "epoch": 1.0, "step": 1797, "total_flos": 376255241256960.0, "train_loss": 1.0248491220097444, "train_runtime": 4390.0943, "train_samples_per_second": 52.385, "train_steps_per_second": 0.409 } ], "logging_steps": 5, "max_steps": 1797, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 376255241256960.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }