{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 608, "global_step": 6076, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_entropy": 2.563803545593872, "eval_loss": 1.3544182777404785, "eval_mean_token_accuracy": 0.687368397152733, "eval_num_tokens": 0.0, "eval_runtime": 198.6529, "eval_samples_per_second": 41.917, "eval_steps_per_second": 6.987, "step": 0 }, { "entropy": 2.578546404838562, "epoch": 0.0003291639236339697, "grad_norm": 58.16326141357422, "learning_rate": 0.0, "loss": 1.4746, "mean_token_accuracy": 0.6601307392120361, "num_tokens": 2101.0, "step": 1 }, { "entropy": 2.547638326883316, "epoch": 0.0016458196181698486, "grad_norm": 50.99738311767578, "learning_rate": 2.6315789473684213e-07, "loss": 1.2383, "mean_token_accuracy": 0.7003932222723961, "num_tokens": 10649.0, "step": 5 }, { "entropy": 2.5566530227661133, "epoch": 0.0032916392363396972, "grad_norm": 38.35203552246094, "learning_rate": 5.921052631578947e-07, "loss": 1.0069, "mean_token_accuracy": 0.7398434042930603, "num_tokens": 21271.0, "step": 10 }, { "entropy": 2.452682948112488, "epoch": 0.004937458854509546, "grad_norm": 27.224767684936523, "learning_rate": 9.210526315789474e-07, "loss": 0.5295, "mean_token_accuracy": 0.8550694227218628, "num_tokens": 32112.0, "step": 15 }, { "entropy": 2.1593106389045715, "epoch": 0.0065832784726793945, "grad_norm": 23.221702575683594, "learning_rate": 1.25e-06, "loss": 0.2773, "mean_token_accuracy": 0.9244555592536926, "num_tokens": 42919.0, "step": 20 }, { "entropy": 1.6036222577095032, "epoch": 0.008229098090849244, "grad_norm": 15.987848281860352, "learning_rate": 1.5789473684210526e-06, "loss": 0.1681, "mean_token_accuracy": 0.9467954277992249, "num_tokens": 53618.0, "step": 25 }, { "entropy": 1.486020290851593, "epoch": 0.009874917709019092, "grad_norm": 11.537003517150879, "learning_rate": 1.9078947368421057e-06, "loss": 0.1446, "mean_token_accuracy": 0.9633475482463837, "num_tokens": 64755.0, "step": 30 }, { "entropy": 1.6325503945350648, "epoch": 0.01152073732718894, "grad_norm": 7.058774471282959, "learning_rate": 2.236842105263158e-06, "loss": 0.149, "mean_token_accuracy": 0.9636693239212036, "num_tokens": 75388.0, "step": 35 }, { "entropy": 1.7897157430648805, "epoch": 0.013166556945358789, "grad_norm": 6.356208324432373, "learning_rate": 2.565789473684211e-06, "loss": 0.0838, "mean_token_accuracy": 0.9791653394699097, "num_tokens": 86153.0, "step": 40 }, { "entropy": 1.7598371386528016, "epoch": 0.014812376563528637, "grad_norm": 11.368063926696777, "learning_rate": 2.8947368421052634e-06, "loss": 0.0981, "mean_token_accuracy": 0.9723430037498474, "num_tokens": 96649.0, "step": 45 }, { "entropy": 1.5412654876708984, "epoch": 0.016458196181698487, "grad_norm": 11.02826976776123, "learning_rate": 3.223684210526316e-06, "loss": 0.0802, "mean_token_accuracy": 0.9778924286365509, "num_tokens": 107209.0, "step": 50 }, { "entropy": 1.419243288040161, "epoch": 0.018104015799868336, "grad_norm": 14.368644714355469, "learning_rate": 3.5526315789473687e-06, "loss": 0.1422, "mean_token_accuracy": 0.9643596112728119, "num_tokens": 117665.0, "step": 55 }, { "entropy": 1.5144587397575378, "epoch": 0.019749835418038184, "grad_norm": 8.765934944152832, "learning_rate": 3.8815789473684214e-06, "loss": 0.1094, "mean_token_accuracy": 0.9742422461509704, "num_tokens": 128149.0, "step": 60 }, { "entropy": 1.6769550323486329, "epoch": 0.021395655036208033, "grad_norm": 9.26172161102295, "learning_rate": 4.210526315789474e-06, "loss": 0.0886, "mean_token_accuracy": 0.9756901502609253, "num_tokens": 138845.0, "step": 65 }, { "entropy": 1.804041564464569, "epoch": 0.02304147465437788, "grad_norm": 7.930450439453125, "learning_rate": 4.539473684210527e-06, "loss": 0.1117, "mean_token_accuracy": 0.9712024033069611, "num_tokens": 149695.0, "step": 70 }, { "entropy": 1.8376032829284668, "epoch": 0.02468729427254773, "grad_norm": 5.366224765777588, "learning_rate": 4.8684210526315795e-06, "loss": 0.0885, "mean_token_accuracy": 0.9742565512657165, "num_tokens": 160769.0, "step": 75 }, { "entropy": 1.8037242650985719, "epoch": 0.026333113890717578, "grad_norm": 50.815311431884766, "learning_rate": 5.197368421052632e-06, "loss": 0.0664, "mean_token_accuracy": 0.9830366194248199, "num_tokens": 171557.0, "step": 80 }, { "entropy": 1.7092326641082765, "epoch": 0.027978933508887426, "grad_norm": 22.222219467163086, "learning_rate": 5.526315789473685e-06, "loss": 0.0962, "mean_token_accuracy": 0.9750658094882965, "num_tokens": 182226.0, "step": 85 }, { "entropy": 1.6772519588470458, "epoch": 0.029624753127057275, "grad_norm": 7.502695560455322, "learning_rate": 5.855263157894738e-06, "loss": 0.1128, "mean_token_accuracy": 0.9759301841259003, "num_tokens": 193341.0, "step": 90 }, { "entropy": 1.8118021130561828, "epoch": 0.031270572745227126, "grad_norm": 4.66135311126709, "learning_rate": 6.18421052631579e-06, "loss": 0.0788, "mean_token_accuracy": 0.9772343516349793, "num_tokens": 204280.0, "step": 95 }, { "entropy": 1.7658962607383728, "epoch": 0.032916392363396975, "grad_norm": 9.973278045654297, "learning_rate": 6.513157894736842e-06, "loss": 0.0693, "mean_token_accuracy": 0.9800639986991883, "num_tokens": 214849.0, "step": 100 }, { "entropy": 1.6705022692680358, "epoch": 0.03456221198156682, "grad_norm": 12.13133430480957, "learning_rate": 6.842105263157896e-06, "loss": 0.1029, "mean_token_accuracy": 0.9713725507259369, "num_tokens": 225617.0, "step": 105 }, { "entropy": 1.6237557530403137, "epoch": 0.03620803159973667, "grad_norm": 12.934727668762207, "learning_rate": 7.1710526315789475e-06, "loss": 0.092, "mean_token_accuracy": 0.9715362131595612, "num_tokens": 236011.0, "step": 110 }, { "entropy": 1.6253692150115966, "epoch": 0.03785385121790652, "grad_norm": 5.850583076477051, "learning_rate": 7.500000000000001e-06, "loss": 0.0856, "mean_token_accuracy": 0.9789658904075622, "num_tokens": 246517.0, "step": 115 }, { "entropy": 1.699216902256012, "epoch": 0.03949967083607637, "grad_norm": 9.97748851776123, "learning_rate": 7.828947368421054e-06, "loss": 0.1076, "mean_token_accuracy": 0.9751021385192871, "num_tokens": 257039.0, "step": 120 }, { "entropy": 1.7488463997840882, "epoch": 0.04114549045424622, "grad_norm": 9.265748023986816, "learning_rate": 8.157894736842106e-06, "loss": 0.0841, "mean_token_accuracy": 0.9726714611053466, "num_tokens": 267666.0, "step": 125 }, { "entropy": 1.7969783902168275, "epoch": 0.042791310072416065, "grad_norm": 6.479616641998291, "learning_rate": 8.486842105263159e-06, "loss": 0.0759, "mean_token_accuracy": 0.9793728470802308, "num_tokens": 278047.0, "step": 130 }, { "entropy": 1.6162196516990661, "epoch": 0.044437129690585914, "grad_norm": 8.1745023727417, "learning_rate": 8.81578947368421e-06, "loss": 0.0684, "mean_token_accuracy": 0.9798145830631256, "num_tokens": 288638.0, "step": 135 }, { "entropy": 1.603439712524414, "epoch": 0.04608294930875576, "grad_norm": 11.199191093444824, "learning_rate": 9.144736842105264e-06, "loss": 0.1103, "mean_token_accuracy": 0.973408317565918, "num_tokens": 299280.0, "step": 140 }, { "entropy": 1.7563074707984925, "epoch": 0.04772876892692561, "grad_norm": 7.97125244140625, "learning_rate": 9.473684210526315e-06, "loss": 0.1014, "mean_token_accuracy": 0.9699687659740448, "num_tokens": 309815.0, "step": 145 }, { "entropy": 1.954578173160553, "epoch": 0.04937458854509546, "grad_norm": 5.959860324859619, "learning_rate": 9.80263157894737e-06, "loss": 0.0905, "mean_token_accuracy": 0.9725431621074676, "num_tokens": 320598.0, "step": 150 }, { "entropy": 1.997165060043335, "epoch": 0.05102040816326531, "grad_norm": 4.954457759857178, "learning_rate": 1.0131578947368421e-05, "loss": 0.0671, "mean_token_accuracy": 0.983484423160553, "num_tokens": 331592.0, "step": 155 }, { "entropy": 1.9534252166748047, "epoch": 0.052666227781435156, "grad_norm": 8.918072700500488, "learning_rate": 1.0460526315789474e-05, "loss": 0.0995, "mean_token_accuracy": 0.9751366376876831, "num_tokens": 342211.0, "step": 160 }, { "entropy": 1.8752238154411316, "epoch": 0.054312047399605004, "grad_norm": 8.222981452941895, "learning_rate": 1.0789473684210528e-05, "loss": 0.1081, "mean_token_accuracy": 0.9731775224208832, "num_tokens": 352894.0, "step": 165 }, { "entropy": 1.8922731161117554, "epoch": 0.05595786701777485, "grad_norm": 6.892168045043945, "learning_rate": 1.111842105263158e-05, "loss": 0.1093, "mean_token_accuracy": 0.9666784107685089, "num_tokens": 363571.0, "step": 170 }, { "entropy": 1.8998683929443358, "epoch": 0.0576036866359447, "grad_norm": 3.256152629852295, "learning_rate": 1.1447368421052632e-05, "loss": 0.0701, "mean_token_accuracy": 0.9812213480472565, "num_tokens": 374077.0, "step": 175 }, { "entropy": 1.8406203866004944, "epoch": 0.05924950625411455, "grad_norm": 5.9513983726501465, "learning_rate": 1.1776315789473684e-05, "loss": 0.0642, "mean_token_accuracy": 0.9829223692417145, "num_tokens": 384936.0, "step": 180 }, { "entropy": 1.8397523403167724, "epoch": 0.0608953258722844, "grad_norm": 5.223180770874023, "learning_rate": 1.2105263157894737e-05, "loss": 0.1066, "mean_token_accuracy": 0.9780671417713165, "num_tokens": 395309.0, "step": 185 }, { "entropy": 1.8732271909713745, "epoch": 0.06254114549045425, "grad_norm": 5.581716060638428, "learning_rate": 1.2434210526315791e-05, "loss": 0.1243, "mean_token_accuracy": 0.9716349482536316, "num_tokens": 406122.0, "step": 190 }, { "entropy": 1.8642462372779847, "epoch": 0.0641869651086241, "grad_norm": 4.00003719329834, "learning_rate": 1.2763157894736844e-05, "loss": 0.0881, "mean_token_accuracy": 0.9758350074291229, "num_tokens": 416622.0, "step": 195 }, { "entropy": 1.763167428970337, "epoch": 0.06583278472679395, "grad_norm": 4.925802707672119, "learning_rate": 1.3092105263157895e-05, "loss": 0.0865, "mean_token_accuracy": 0.9732065200805664, "num_tokens": 427374.0, "step": 200 }, { "entropy": 1.707397425174713, "epoch": 0.06747860434496379, "grad_norm": 3.0551066398620605, "learning_rate": 1.3421052631578948e-05, "loss": 0.081, "mean_token_accuracy": 0.9832678198814392, "num_tokens": 437906.0, "step": 205 }, { "entropy": 1.6210663080215455, "epoch": 0.06912442396313365, "grad_norm": 5.318910598754883, "learning_rate": 1.375e-05, "loss": 0.0959, "mean_token_accuracy": 0.978535383939743, "num_tokens": 448512.0, "step": 210 }, { "entropy": 1.7267163753509522, "epoch": 0.07077024358130349, "grad_norm": 18.240541458129883, "learning_rate": 1.4078947368421055e-05, "loss": 0.0878, "mean_token_accuracy": 0.9779551386833191, "num_tokens": 459296.0, "step": 215 }, { "entropy": 1.9074785947799682, "epoch": 0.07241606319947334, "grad_norm": 4.598775386810303, "learning_rate": 1.4407894736842108e-05, "loss": 0.1177, "mean_token_accuracy": 0.9697455763816833, "num_tokens": 469857.0, "step": 220 }, { "entropy": 2.076517927646637, "epoch": 0.07406188281764318, "grad_norm": 6.2975239753723145, "learning_rate": 1.4736842105263159e-05, "loss": 0.1191, "mean_token_accuracy": 0.9704063773155213, "num_tokens": 480679.0, "step": 225 }, { "entropy": 2.075191152095795, "epoch": 0.07570770243581304, "grad_norm": 5.611932754516602, "learning_rate": 1.5065789473684211e-05, "loss": 0.1126, "mean_token_accuracy": 0.9733084321022034, "num_tokens": 491266.0, "step": 230 }, { "entropy": 1.8972731590270997, "epoch": 0.07735352205398288, "grad_norm": 7.2364888191223145, "learning_rate": 1.5394736842105264e-05, "loss": 0.0767, "mean_token_accuracy": 0.9798118114471436, "num_tokens": 502150.0, "step": 235 }, { "entropy": 1.9421729803085328, "epoch": 0.07899934167215274, "grad_norm": 5.142597198486328, "learning_rate": 1.572368421052632e-05, "loss": 0.0984, "mean_token_accuracy": 0.9767365634441376, "num_tokens": 512761.0, "step": 240 }, { "entropy": 2.1024407386779784, "epoch": 0.08064516129032258, "grad_norm": 6.71290922164917, "learning_rate": 1.605263157894737e-05, "loss": 0.0941, "mean_token_accuracy": 0.968624371290207, "num_tokens": 523326.0, "step": 245 }, { "entropy": 2.108752429485321, "epoch": 0.08229098090849243, "grad_norm": 6.765599727630615, "learning_rate": 1.638157894736842e-05, "loss": 0.1249, "mean_token_accuracy": 0.9727495968341827, "num_tokens": 534182.0, "step": 250 }, { "entropy": 2.1626508831977844, "epoch": 0.08393680052666228, "grad_norm": 6.46060848236084, "learning_rate": 1.6710526315789475e-05, "loss": 0.0776, "mean_token_accuracy": 0.9789380371570587, "num_tokens": 544990.0, "step": 255 }, { "entropy": 2.010746192932129, "epoch": 0.08558262014483213, "grad_norm": 5.445334434509277, "learning_rate": 1.703947368421053e-05, "loss": 0.0855, "mean_token_accuracy": 0.9773862421512604, "num_tokens": 555445.0, "step": 260 }, { "entropy": 1.7812985062599183, "epoch": 0.08722843976300197, "grad_norm": 5.686192512512207, "learning_rate": 1.736842105263158e-05, "loss": 0.0891, "mean_token_accuracy": 0.975337165594101, "num_tokens": 566278.0, "step": 265 }, { "entropy": 1.6327629923820495, "epoch": 0.08887425938117183, "grad_norm": 6.93804931640625, "learning_rate": 1.769736842105263e-05, "loss": 0.1093, "mean_token_accuracy": 0.9774836421012878, "num_tokens": 577347.0, "step": 270 }, { "entropy": 1.6540838599205017, "epoch": 0.09052007899934167, "grad_norm": 4.582030773162842, "learning_rate": 1.8026315789473685e-05, "loss": 0.0712, "mean_token_accuracy": 0.9831354200839997, "num_tokens": 587873.0, "step": 275 }, { "entropy": 1.614749014377594, "epoch": 0.09216589861751152, "grad_norm": 4.63378381729126, "learning_rate": 1.835526315789474e-05, "loss": 0.0818, "mean_token_accuracy": 0.9796861112117767, "num_tokens": 598584.0, "step": 280 }, { "entropy": 1.7127745985984801, "epoch": 0.09381171823568137, "grad_norm": 5.947144508361816, "learning_rate": 1.868421052631579e-05, "loss": 0.0864, "mean_token_accuracy": 0.9765213131904602, "num_tokens": 608860.0, "step": 285 }, { "entropy": 1.871683084964752, "epoch": 0.09545753785385122, "grad_norm": 7.703008651733398, "learning_rate": 1.9013157894736845e-05, "loss": 0.0671, "mean_token_accuracy": 0.9811056196689606, "num_tokens": 619377.0, "step": 290 }, { "entropy": 1.9299902677536012, "epoch": 0.09710335747202106, "grad_norm": 6.416894912719727, "learning_rate": 1.9342105263157896e-05, "loss": 0.089, "mean_token_accuracy": 0.9779207348823548, "num_tokens": 630207.0, "step": 295 }, { "entropy": 2.0591522693634032, "epoch": 0.09874917709019092, "grad_norm": 2.4779272079467773, "learning_rate": 1.9671052631578947e-05, "loss": 0.0552, "mean_token_accuracy": 0.9879588544368744, "num_tokens": 640467.0, "step": 300 }, { "entropy": 2.069437396526337, "epoch": 0.10039499670836076, "grad_norm": 6.207917213439941, "learning_rate": 2e-05, "loss": 0.1163, "mean_token_accuracy": 0.9745943665504455, "num_tokens": 651341.0, "step": 305 }, { "entropy": 2.21907696723938, "epoch": 0.10204081632653061, "grad_norm": 4.81263542175293, "learning_rate": 1.9999962969732823e-05, "loss": 0.0853, "mean_token_accuracy": 0.9776013195514679, "num_tokens": 661968.0, "step": 310 }, { "entropy": 2.249755620956421, "epoch": 0.10368663594470046, "grad_norm": 5.646886348724365, "learning_rate": 1.999985187920555e-05, "loss": 0.114, "mean_token_accuracy": 0.9764432370662689, "num_tokens": 672739.0, "step": 315 }, { "entropy": 2.1538678646087646, "epoch": 0.10533245556287031, "grad_norm": 6.742599010467529, "learning_rate": 1.9999666729240908e-05, "loss": 0.0768, "mean_token_accuracy": 0.9786048173904419, "num_tokens": 683514.0, "step": 320 }, { "entropy": 1.9418102025985717, "epoch": 0.10697827518104015, "grad_norm": 5.256528854370117, "learning_rate": 1.9999407521210143e-05, "loss": 0.091, "mean_token_accuracy": 0.9797745287418366, "num_tokens": 694254.0, "step": 325 }, { "entropy": 1.8409188032150268, "epoch": 0.10862409479921001, "grad_norm": 6.152730464935303, "learning_rate": 1.9999074257032953e-05, "loss": 0.0924, "mean_token_accuracy": 0.9798021256923676, "num_tokens": 704933.0, "step": 330 }, { "entropy": 1.9683008909225463, "epoch": 0.11026991441737985, "grad_norm": 6.580195903778076, "learning_rate": 1.9998666939177514e-05, "loss": 0.1345, "mean_token_accuracy": 0.9660695493221283, "num_tokens": 715336.0, "step": 335 }, { "entropy": 2.197187530994415, "epoch": 0.1119157340355497, "grad_norm": 5.056039810180664, "learning_rate": 1.9998185570660445e-05, "loss": 0.1243, "mean_token_accuracy": 0.9728079676628113, "num_tokens": 726034.0, "step": 340 }, { "entropy": 2.3285026073455812, "epoch": 0.11356155365371955, "grad_norm": 6.029480934143066, "learning_rate": 1.9997630155046784e-05, "loss": 0.1302, "mean_token_accuracy": 0.9665205538272857, "num_tokens": 737083.0, "step": 345 }, { "entropy": 2.1871792316436767, "epoch": 0.1152073732718894, "grad_norm": 5.840641498565674, "learning_rate": 1.9997000696449973e-05, "loss": 0.087, "mean_token_accuracy": 0.9797586441040039, "num_tokens": 747987.0, "step": 350 }, { "entropy": 2.055765151977539, "epoch": 0.11685319289005924, "grad_norm": 10.675565719604492, "learning_rate": 1.9996297199531813e-05, "loss": 0.1199, "mean_token_accuracy": 0.9669549405574799, "num_tokens": 758532.0, "step": 355 }, { "entropy": 2.0415255546569826, "epoch": 0.1184990125082291, "grad_norm": 6.482625961303711, "learning_rate": 1.9995519669502438e-05, "loss": 0.1343, "mean_token_accuracy": 0.9686403274536133, "num_tokens": 769143.0, "step": 360 }, { "entropy": 2.146265912055969, "epoch": 0.12014483212639894, "grad_norm": 6.276276588439941, "learning_rate": 1.9994668112120283e-05, "loss": 0.0893, "mean_token_accuracy": 0.9802504718303681, "num_tokens": 779714.0, "step": 365 }, { "entropy": 2.267709231376648, "epoch": 0.1217906517445688, "grad_norm": 3.6624529361724854, "learning_rate": 1.999374253369202e-05, "loss": 0.1289, "mean_token_accuracy": 0.967824923992157, "num_tokens": 790409.0, "step": 370 }, { "entropy": 2.3208181142807005, "epoch": 0.12343647136273865, "grad_norm": 8.087434768676758, "learning_rate": 1.999274294107254e-05, "loss": 0.0958, "mean_token_accuracy": 0.9716035008430481, "num_tokens": 801020.0, "step": 375 }, { "entropy": 2.2529816150665285, "epoch": 0.1250822909809085, "grad_norm": 6.710372447967529, "learning_rate": 1.9991669341664873e-05, "loss": 0.1048, "mean_token_accuracy": 0.971696001291275, "num_tokens": 811861.0, "step": 380 }, { "entropy": 2.1597923040390015, "epoch": 0.12672811059907835, "grad_norm": 3.219034194946289, "learning_rate": 1.9990521743420156e-05, "loss": 0.1137, "mean_token_accuracy": 0.9790872097015381, "num_tokens": 822548.0, "step": 385 }, { "entropy": 2.06735520362854, "epoch": 0.1283739302172482, "grad_norm": 2.9641964435577393, "learning_rate": 1.9989300154837564e-05, "loss": 0.0935, "mean_token_accuracy": 0.9800699293613434, "num_tokens": 833368.0, "step": 390 }, { "entropy": 2.0970317363739013, "epoch": 0.13001974983541803, "grad_norm": 4.221366882324219, "learning_rate": 1.9988004584964243e-05, "loss": 0.0939, "mean_token_accuracy": 0.9755605041980744, "num_tokens": 844003.0, "step": 395 }, { "entropy": 2.073064410686493, "epoch": 0.1316655694535879, "grad_norm": 7.461073398590088, "learning_rate": 1.9986635043395258e-05, "loss": 0.1238, "mean_token_accuracy": 0.9669863104820251, "num_tokens": 854150.0, "step": 400 }, { "entropy": 1.975202488899231, "epoch": 0.13331138907175774, "grad_norm": 6.416271209716797, "learning_rate": 1.9985191540273506e-05, "loss": 0.0933, "mean_token_accuracy": 0.9760487675666809, "num_tokens": 864751.0, "step": 405 }, { "entropy": 1.9307495832443238, "epoch": 0.13495720868992758, "grad_norm": 6.192378044128418, "learning_rate": 1.9983674086289647e-05, "loss": 0.0756, "mean_token_accuracy": 0.9799009144306183, "num_tokens": 875364.0, "step": 410 }, { "entropy": 1.9320047855377198, "epoch": 0.13660302830809742, "grad_norm": 4.568058013916016, "learning_rate": 1.9982082692682027e-05, "loss": 0.0647, "mean_token_accuracy": 0.9817326724529266, "num_tokens": 885830.0, "step": 415 }, { "entropy": 1.8869742393493651, "epoch": 0.1382488479262673, "grad_norm": 6.1201019287109375, "learning_rate": 1.998041737123659e-05, "loss": 0.1027, "mean_token_accuracy": 0.9717364609241486, "num_tokens": 896622.0, "step": 420 }, { "entropy": 1.9822751045227052, "epoch": 0.13989466754443713, "grad_norm": 4.20133638381958, "learning_rate": 1.9978678134286796e-05, "loss": 0.1138, "mean_token_accuracy": 0.9745773077011108, "num_tokens": 907113.0, "step": 425 }, { "entropy": 2.109739363193512, "epoch": 0.14154048716260698, "grad_norm": 4.79110860824585, "learning_rate": 1.997686499471353e-05, "loss": 0.0759, "mean_token_accuracy": 0.9797537565231323, "num_tokens": 917765.0, "step": 430 }, { "entropy": 2.127437674999237, "epoch": 0.14318630678077682, "grad_norm": 7.135578632354736, "learning_rate": 1.9974977965945e-05, "loss": 0.0929, "mean_token_accuracy": 0.9721114039421082, "num_tokens": 928469.0, "step": 435 }, { "entropy": 1.9786985039710998, "epoch": 0.1448321263989467, "grad_norm": 5.621862888336182, "learning_rate": 1.9973017061956638e-05, "loss": 0.1085, "mean_token_accuracy": 0.9724132001399994, "num_tokens": 939212.0, "step": 440 }, { "entropy": 1.7866165399551392, "epoch": 0.14647794601711653, "grad_norm": 3.2050716876983643, "learning_rate": 1.9970982297271007e-05, "loss": 0.0794, "mean_token_accuracy": 0.978398722410202, "num_tokens": 950445.0, "step": 445 }, { "entropy": 1.7645732879638671, "epoch": 0.14812376563528637, "grad_norm": 6.321614742279053, "learning_rate": 1.996887368695768e-05, "loss": 0.0907, "mean_token_accuracy": 0.9744451761245727, "num_tokens": 961106.0, "step": 450 }, { "entropy": 1.7317794203758239, "epoch": 0.1497695852534562, "grad_norm": 4.161308765411377, "learning_rate": 1.9966691246633143e-05, "loss": 0.0625, "mean_token_accuracy": 0.9846729040145874, "num_tokens": 971898.0, "step": 455 }, { "entropy": 1.742343807220459, "epoch": 0.15141540487162608, "grad_norm": 4.4447503089904785, "learning_rate": 1.9964434992460662e-05, "loss": 0.0797, "mean_token_accuracy": 0.9820732772350311, "num_tokens": 982441.0, "step": 460 }, { "entropy": 1.8356537818908691, "epoch": 0.15306122448979592, "grad_norm": 10.55756664276123, "learning_rate": 1.9962104941150177e-05, "loss": 0.1023, "mean_token_accuracy": 0.9745754778385163, "num_tokens": 993128.0, "step": 465 }, { "entropy": 2.004080033302307, "epoch": 0.15470704410796576, "grad_norm": 3.8827741146087646, "learning_rate": 1.995970110995817e-05, "loss": 0.0735, "mean_token_accuracy": 0.9818957507610321, "num_tokens": 1003682.0, "step": 470 }, { "entropy": 2.0348508238792418, "epoch": 0.1563528637261356, "grad_norm": 5.195158958435059, "learning_rate": 1.9957223516687545e-05, "loss": 0.0934, "mean_token_accuracy": 0.9747053384780884, "num_tokens": 1014255.0, "step": 475 }, { "entropy": 2.0494486331939696, "epoch": 0.15799868334430547, "grad_norm": 6.356441497802734, "learning_rate": 1.995467217968749e-05, "loss": 0.1203, "mean_token_accuracy": 0.9745293378829956, "num_tokens": 1024794.0, "step": 480 }, { "entropy": 2.107176351547241, "epoch": 0.15964450296247532, "grad_norm": 4.578567028045654, "learning_rate": 1.9952047117853345e-05, "loss": 0.0774, "mean_token_accuracy": 0.9803178489208222, "num_tokens": 1035500.0, "step": 485 }, { "entropy": 2.1053681492805483, "epoch": 0.16129032258064516, "grad_norm": 3.887092113494873, "learning_rate": 1.9949348350626456e-05, "loss": 0.0903, "mean_token_accuracy": 0.9732440710067749, "num_tokens": 1046221.0, "step": 490 }, { "entropy": 2.17068989276886, "epoch": 0.162936142198815, "grad_norm": 8.976771354675293, "learning_rate": 1.9946575897994042e-05, "loss": 0.0549, "mean_token_accuracy": 0.9809908986091613, "num_tokens": 1056570.0, "step": 495 }, { "entropy": 2.05935001373291, "epoch": 0.16458196181698487, "grad_norm": 4.243111610412598, "learning_rate": 1.994372978048903e-05, "loss": 0.0691, "mean_token_accuracy": 0.9819040298461914, "num_tokens": 1067196.0, "step": 500 }, { "entropy": 2.047864282131195, "epoch": 0.1662277814351547, "grad_norm": 3.212517738342285, "learning_rate": 1.9940810019189912e-05, "loss": 0.0839, "mean_token_accuracy": 0.9838090300559997, "num_tokens": 1077680.0, "step": 505 }, { "entropy": 2.0122323632240295, "epoch": 0.16787360105332455, "grad_norm": 7.405489444732666, "learning_rate": 1.9937816635720614e-05, "loss": 0.1078, "mean_token_accuracy": 0.9703534841537476, "num_tokens": 1088097.0, "step": 510 }, { "entropy": 2.0119399070739745, "epoch": 0.1695194206714944, "grad_norm": 5.046123027801514, "learning_rate": 1.9934749652250275e-05, "loss": 0.085, "mean_token_accuracy": 0.9790121138095855, "num_tokens": 1098795.0, "step": 515 }, { "entropy": 2.0802261352539064, "epoch": 0.17116524028966426, "grad_norm": 4.109484672546387, "learning_rate": 1.9931609091493154e-05, "loss": 0.0873, "mean_token_accuracy": 0.9734878361225128, "num_tokens": 1109471.0, "step": 520 }, { "entropy": 2.095242714881897, "epoch": 0.1728110599078341, "grad_norm": 3.953889846801758, "learning_rate": 1.9928394976708403e-05, "loss": 0.089, "mean_token_accuracy": 0.9813098549842835, "num_tokens": 1119968.0, "step": 525 }, { "entropy": 2.2089867115020754, "epoch": 0.17445687952600394, "grad_norm": 5.200376510620117, "learning_rate": 1.9925107331699928e-05, "loss": 0.1003, "mean_token_accuracy": 0.9750366508960724, "num_tokens": 1130579.0, "step": 530 }, { "entropy": 2.2357950687408445, "epoch": 0.17610269914417379, "grad_norm": 5.884069919586182, "learning_rate": 1.992174618081621e-05, "loss": 0.0867, "mean_token_accuracy": 0.979562520980835, "num_tokens": 1140933.0, "step": 535 }, { "entropy": 2.1410197973251344, "epoch": 0.17774851876234365, "grad_norm": 6.01693058013916, "learning_rate": 1.9918311548950102e-05, "loss": 0.0957, "mean_token_accuracy": 0.9744680047035217, "num_tokens": 1151887.0, "step": 540 }, { "entropy": 2.228221225738525, "epoch": 0.1793943383805135, "grad_norm": 3.3614420890808105, "learning_rate": 1.991480346153868e-05, "loss": 0.0887, "mean_token_accuracy": 0.9763470590114594, "num_tokens": 1162403.0, "step": 545 }, { "entropy": 2.1968831777572633, "epoch": 0.18104015799868334, "grad_norm": 4.788167476654053, "learning_rate": 1.9911221944563022e-05, "loss": 0.0929, "mean_token_accuracy": 0.9781689345836639, "num_tokens": 1173014.0, "step": 550 }, { "entropy": 2.0954206466674803, "epoch": 0.18268597761685318, "grad_norm": 9.105097770690918, "learning_rate": 1.9907567024548037e-05, "loss": 0.0834, "mean_token_accuracy": 0.9764321863651275, "num_tokens": 1183721.0, "step": 555 }, { "entropy": 2.082006883621216, "epoch": 0.18433179723502305, "grad_norm": 5.6749267578125, "learning_rate": 1.990383872856226e-05, "loss": 0.0805, "mean_token_accuracy": 0.9781780660152435, "num_tokens": 1194206.0, "step": 560 }, { "entropy": 2.0844071984291075, "epoch": 0.1859776168531929, "grad_norm": 5.848039150238037, "learning_rate": 1.9900037084217637e-05, "loss": 0.1125, "mean_token_accuracy": 0.9710779368877411, "num_tokens": 1205005.0, "step": 565 }, { "entropy": 2.138554549217224, "epoch": 0.18762343647136273, "grad_norm": 4.573561668395996, "learning_rate": 1.9896162119669367e-05, "loss": 0.0784, "mean_token_accuracy": 0.978986918926239, "num_tokens": 1215653.0, "step": 570 }, { "entropy": 2.1205518007278443, "epoch": 0.1892692560895326, "grad_norm": 7.720390796661377, "learning_rate": 1.9892213863615634e-05, "loss": 0.1176, "mean_token_accuracy": 0.9739040195941925, "num_tokens": 1226075.0, "step": 575 }, { "entropy": 2.1874123096466063, "epoch": 0.19091507570770244, "grad_norm": 3.979853630065918, "learning_rate": 1.9888192345297438e-05, "loss": 0.092, "mean_token_accuracy": 0.9774543702602386, "num_tokens": 1236488.0, "step": 580 }, { "entropy": 2.240754795074463, "epoch": 0.19256089532587228, "grad_norm": 2.265650510787964, "learning_rate": 1.9884097594498355e-05, "loss": 0.0671, "mean_token_accuracy": 0.9827798247337342, "num_tokens": 1246837.0, "step": 585 }, { "entropy": 2.20197674036026, "epoch": 0.19420671494404212, "grad_norm": 8.468692779541016, "learning_rate": 1.9879929641544328e-05, "loss": 0.0818, "mean_token_accuracy": 0.9744803309440613, "num_tokens": 1257412.0, "step": 590 }, { "entropy": 2.1012311816215514, "epoch": 0.195852534562212, "grad_norm": 4.669355869293213, "learning_rate": 1.9875688517303442e-05, "loss": 0.0844, "mean_token_accuracy": 0.9798676431179046, "num_tokens": 1268044.0, "step": 595 }, { "entropy": 2.048796272277832, "epoch": 0.19749835418038184, "grad_norm": 3.2685132026672363, "learning_rate": 1.987137425318569e-05, "loss": 0.0601, "mean_token_accuracy": 0.9844081580638886, "num_tokens": 1278849.0, "step": 600 }, { "entropy": 2.0971350193023683, "epoch": 0.19914417379855168, "grad_norm": 5.5244317054748535, "learning_rate": 1.9866986881142737e-05, "loss": 0.1095, "mean_token_accuracy": 0.9722402155399322, "num_tokens": 1289463.0, "step": 605 }, { "epoch": 0.20013166556945358, "eval_entropy": 2.089414228537584, "eval_loss": 0.0923554077744484, "eval_mean_token_accuracy": 0.9766228082200635, "eval_num_tokens": 1295891.0, "eval_runtime": 196.5996, "eval_samples_per_second": 42.355, "eval_steps_per_second": 7.06, "step": 608 }, { "entropy": 2.0954473495483397, "epoch": 0.20078999341672152, "grad_norm": 8.191391944885254, "learning_rate": 1.9862526433667702e-05, "loss": 0.1212, "mean_token_accuracy": 0.97202570438385, "num_tokens": 1300049.0, "step": 610 }, { "entropy": 2.0918750286102297, "epoch": 0.2024358130348914, "grad_norm": 5.4046478271484375, "learning_rate": 1.9857992943794894e-05, "loss": 0.0611, "mean_token_accuracy": 0.984895896911621, "num_tokens": 1310575.0, "step": 615 }, { "entropy": 2.0892592668533325, "epoch": 0.20408163265306123, "grad_norm": 4.6009840965271, "learning_rate": 1.9853386445099585e-05, "loss": 0.107, "mean_token_accuracy": 0.9737550795078278, "num_tokens": 1321474.0, "step": 620 }, { "entropy": 2.1745733857154845, "epoch": 0.20572745227123107, "grad_norm": 5.094756603240967, "learning_rate": 1.9848706971697744e-05, "loss": 0.0719, "mean_token_accuracy": 0.9817912340164184, "num_tokens": 1332086.0, "step": 625 }, { "entropy": 2.1878711700439455, "epoch": 0.2073732718894009, "grad_norm": 8.04099178314209, "learning_rate": 1.98439545582458e-05, "loss": 0.1345, "mean_token_accuracy": 0.9673551738262176, "num_tokens": 1342659.0, "step": 630 }, { "entropy": 2.1850104570388793, "epoch": 0.20901909150757078, "grad_norm": 5.618947982788086, "learning_rate": 1.9839129239940392e-05, "loss": 0.1271, "mean_token_accuracy": 0.9722469210624695, "num_tokens": 1353255.0, "step": 635 }, { "entropy": 2.2415198564529417, "epoch": 0.21066491112574062, "grad_norm": 4.641305923461914, "learning_rate": 1.9834231052518074e-05, "loss": 0.0918, "mean_token_accuracy": 0.9772710859775543, "num_tokens": 1363984.0, "step": 640 }, { "entropy": 2.126233756542206, "epoch": 0.21231073074391046, "grad_norm": 6.022129535675049, "learning_rate": 1.9829260032255093e-05, "loss": 0.1311, "mean_token_accuracy": 0.9707091450691223, "num_tokens": 1374522.0, "step": 645 }, { "entropy": 2.136021304130554, "epoch": 0.2139565503620803, "grad_norm": 5.552321910858154, "learning_rate": 1.9824216215967082e-05, "loss": 0.12, "mean_token_accuracy": 0.9679823756217957, "num_tokens": 1385283.0, "step": 650 }, { "entropy": 2.1720472097396852, "epoch": 0.21560236998025017, "grad_norm": 2.7927908897399902, "learning_rate": 1.9819099641008817e-05, "loss": 0.1065, "mean_token_accuracy": 0.9764181435108185, "num_tokens": 1395464.0, "step": 655 }, { "entropy": 2.1303334712982176, "epoch": 0.21724818959842002, "grad_norm": 6.158787250518799, "learning_rate": 1.9813910345273927e-05, "loss": 0.0986, "mean_token_accuracy": 0.9694771349430085, "num_tokens": 1406190.0, "step": 660 }, { "entropy": 2.0735477566719056, "epoch": 0.21889400921658986, "grad_norm": 3.1743812561035156, "learning_rate": 1.9808648367194614e-05, "loss": 0.1286, "mean_token_accuracy": 0.9723605871200561, "num_tokens": 1417233.0, "step": 665 }, { "entropy": 1.9879053115844727, "epoch": 0.2205398288347597, "grad_norm": 4.825967311859131, "learning_rate": 1.980331374574137e-05, "loss": 0.0633, "mean_token_accuracy": 0.9823581516742707, "num_tokens": 1427885.0, "step": 670 }, { "entropy": 1.953224778175354, "epoch": 0.22218564845292957, "grad_norm": 5.118729591369629, "learning_rate": 1.979790652042268e-05, "loss": 0.0851, "mean_token_accuracy": 0.9767945885658265, "num_tokens": 1438245.0, "step": 675 }, { "entropy": 1.881700599193573, "epoch": 0.2238314680710994, "grad_norm": 4.021531105041504, "learning_rate": 1.9792426731284745e-05, "loss": 0.1035, "mean_token_accuracy": 0.9744091331958771, "num_tokens": 1449016.0, "step": 680 }, { "entropy": 1.907604944705963, "epoch": 0.22547728768926925, "grad_norm": 3.6351332664489746, "learning_rate": 1.9786874418911187e-05, "loss": 0.069, "mean_token_accuracy": 0.9822580933570861, "num_tokens": 1459773.0, "step": 685 }, { "entropy": 1.8353791952133178, "epoch": 0.2271231073074391, "grad_norm": 5.1667022705078125, "learning_rate": 1.9781249624422714e-05, "loss": 0.0554, "mean_token_accuracy": 0.9839705526828766, "num_tokens": 1470406.0, "step": 690 }, { "entropy": 1.7690637350082397, "epoch": 0.22876892692560896, "grad_norm": 1.4933720827102661, "learning_rate": 1.9775552389476865e-05, "loss": 0.0929, "mean_token_accuracy": 0.9805386900901795, "num_tokens": 1481079.0, "step": 695 }, { "entropy": 1.7553959727287292, "epoch": 0.2304147465437788, "grad_norm": 7.114426612854004, "learning_rate": 1.976978275626766e-05, "loss": 0.0793, "mean_token_accuracy": 0.9830655574798584, "num_tokens": 1491605.0, "step": 700 }, { "entropy": 1.6573581099510193, "epoch": 0.23206056616194864, "grad_norm": 8.001904487609863, "learning_rate": 1.976394076752531e-05, "loss": 0.0476, "mean_token_accuracy": 0.986677348613739, "num_tokens": 1502176.0, "step": 705 }, { "entropy": 1.6726499915122985, "epoch": 0.2337063857801185, "grad_norm": 2.7234785556793213, "learning_rate": 1.9758026466515902e-05, "loss": 0.0755, "mean_token_accuracy": 0.9787233471870422, "num_tokens": 1512737.0, "step": 710 }, { "entropy": 1.6567459583282471, "epoch": 0.23535220539828836, "grad_norm": 4.0856547355651855, "learning_rate": 1.975203989704106e-05, "loss": 0.0923, "mean_token_accuracy": 0.980503261089325, "num_tokens": 1523545.0, "step": 715 }, { "entropy": 1.7129284858703613, "epoch": 0.2369980250164582, "grad_norm": 5.151584625244141, "learning_rate": 1.9745981103437643e-05, "loss": 0.0799, "mean_token_accuracy": 0.979759806394577, "num_tokens": 1534017.0, "step": 720 }, { "entropy": 1.7500203251838684, "epoch": 0.23864384463462804, "grad_norm": 4.301064968109131, "learning_rate": 1.9739850130577393e-05, "loss": 0.0907, "mean_token_accuracy": 0.9790736913681031, "num_tokens": 1544763.0, "step": 725 }, { "entropy": 1.741828978061676, "epoch": 0.24028966425279788, "grad_norm": 8.185233116149902, "learning_rate": 1.973364702386663e-05, "loss": 0.1146, "mean_token_accuracy": 0.9718895375728607, "num_tokens": 1555495.0, "step": 730 }, { "entropy": 1.706503689289093, "epoch": 0.24193548387096775, "grad_norm": 2.9231204986572266, "learning_rate": 1.972737182924589e-05, "loss": 0.0904, "mean_token_accuracy": 0.9795828342437745, "num_tokens": 1566520.0, "step": 735 }, { "entropy": 1.6863952279090881, "epoch": 0.2435813034891376, "grad_norm": 5.155311584472656, "learning_rate": 1.9721024593189596e-05, "loss": 0.0856, "mean_token_accuracy": 0.9796004176139832, "num_tokens": 1577011.0, "step": 740 }, { "entropy": 1.6566576838493348, "epoch": 0.24522712310730743, "grad_norm": 5.252721309661865, "learning_rate": 1.9714605362705725e-05, "loss": 0.0973, "mean_token_accuracy": 0.9750127613544464, "num_tokens": 1587496.0, "step": 745 }, { "entropy": 1.646758258342743, "epoch": 0.2468729427254773, "grad_norm": 6.778387069702148, "learning_rate": 1.9708114185335434e-05, "loss": 0.0768, "mean_token_accuracy": 0.9819158792495728, "num_tokens": 1598276.0, "step": 750 }, { "entropy": 1.5819305777549744, "epoch": 0.24851876234364714, "grad_norm": 2.566246747970581, "learning_rate": 1.9701551109152732e-05, "loss": 0.0955, "mean_token_accuracy": 0.9769714951515198, "num_tokens": 1609214.0, "step": 755 }, { "entropy": 1.5540341138839722, "epoch": 0.250164581961817, "grad_norm": 7.977406978607178, "learning_rate": 1.9694916182764113e-05, "loss": 0.1004, "mean_token_accuracy": 0.9762414395809174, "num_tokens": 1620096.0, "step": 760 }, { "entropy": 1.6054931163787842, "epoch": 0.25181040157998685, "grad_norm": 3.9608373641967773, "learning_rate": 1.96882094553082e-05, "loss": 0.0981, "mean_token_accuracy": 0.9779304265975952, "num_tokens": 1630872.0, "step": 765 }, { "entropy": 1.6776898503303528, "epoch": 0.2534562211981567, "grad_norm": 5.155510425567627, "learning_rate": 1.9681430976455363e-05, "loss": 0.0898, "mean_token_accuracy": 0.9730437695980072, "num_tokens": 1641601.0, "step": 770 }, { "entropy": 1.8331576704978942, "epoch": 0.25510204081632654, "grad_norm": 3.792520523071289, "learning_rate": 1.9674580796407392e-05, "loss": 0.098, "mean_token_accuracy": 0.9796571433544159, "num_tokens": 1652224.0, "step": 775 }, { "entropy": 1.8529532432556153, "epoch": 0.2567478604344964, "grad_norm": 5.989309310913086, "learning_rate": 1.966765896589708e-05, "loss": 0.1201, "mean_token_accuracy": 0.9693608701229095, "num_tokens": 1662822.0, "step": 780 }, { "entropy": 1.8614219188690186, "epoch": 0.2583936800526662, "grad_norm": 4.173523426055908, "learning_rate": 1.9660665536187875e-05, "loss": 0.092, "mean_token_accuracy": 0.9794945657253266, "num_tokens": 1673483.0, "step": 785 }, { "entropy": 1.8263102531433106, "epoch": 0.26003949967083606, "grad_norm": 3.7985875606536865, "learning_rate": 1.965360055907349e-05, "loss": 0.0998, "mean_token_accuracy": 0.9791225135326386, "num_tokens": 1684086.0, "step": 790 }, { "entropy": 1.8916141629219054, "epoch": 0.2616853192890059, "grad_norm": 5.6334052085876465, "learning_rate": 1.9646464086877524e-05, "loss": 0.0919, "mean_token_accuracy": 0.9774609744548798, "num_tokens": 1694892.0, "step": 795 }, { "entropy": 1.8735905647277833, "epoch": 0.2633311389071758, "grad_norm": 4.0586466789245605, "learning_rate": 1.963925617245307e-05, "loss": 0.0778, "mean_token_accuracy": 0.9765025436878204, "num_tokens": 1705771.0, "step": 800 }, { "entropy": 1.8850264430046082, "epoch": 0.26497695852534564, "grad_norm": 3.566467523574829, "learning_rate": 1.963197686918233e-05, "loss": 0.0735, "mean_token_accuracy": 0.9790683507919311, "num_tokens": 1716121.0, "step": 805 }, { "entropy": 1.8341678500175476, "epoch": 0.2666227781435155, "grad_norm": 3.038248062133789, "learning_rate": 1.9624626230976208e-05, "loss": 0.0674, "mean_token_accuracy": 0.9794325351715087, "num_tokens": 1726679.0, "step": 810 }, { "entropy": 1.8264055967330932, "epoch": 0.2682685977616853, "grad_norm": 7.074533462524414, "learning_rate": 1.961720431227393e-05, "loss": 0.0827, "mean_token_accuracy": 0.9796413004398346, "num_tokens": 1737353.0, "step": 815 }, { "entropy": 1.8825155735015868, "epoch": 0.26991441737985516, "grad_norm": 6.188788890838623, "learning_rate": 1.9609711168042612e-05, "loss": 0.0939, "mean_token_accuracy": 0.9734999477863312, "num_tokens": 1748061.0, "step": 820 }, { "entropy": 1.9580488204956055, "epoch": 0.271560236998025, "grad_norm": 3.9740562438964844, "learning_rate": 1.9602146853776894e-05, "loss": 0.0535, "mean_token_accuracy": 0.9856287181377411, "num_tokens": 1758768.0, "step": 825 }, { "entropy": 1.9614617347717285, "epoch": 0.27320605661619485, "grad_norm": 4.990133285522461, "learning_rate": 1.9594511425498487e-05, "loss": 0.0551, "mean_token_accuracy": 0.9855091035366058, "num_tokens": 1769139.0, "step": 830 }, { "entropy": 1.7971890330314637, "epoch": 0.2748518762343647, "grad_norm": 5.05476713180542, "learning_rate": 1.958680493975578e-05, "loss": 0.0687, "mean_token_accuracy": 0.9821464836597442, "num_tokens": 1780069.0, "step": 835 }, { "entropy": 1.7543489336967468, "epoch": 0.2764976958525346, "grad_norm": 4.147070407867432, "learning_rate": 1.957902745362341e-05, "loss": 0.0743, "mean_token_accuracy": 0.9777528941631317, "num_tokens": 1790802.0, "step": 840 }, { "entropy": 1.6835835099220275, "epoch": 0.2781435154707044, "grad_norm": 7.310057640075684, "learning_rate": 1.957117902470187e-05, "loss": 0.0913, "mean_token_accuracy": 0.9787399888038635, "num_tokens": 1801255.0, "step": 845 }, { "entropy": 1.7269906282424927, "epoch": 0.27978933508887427, "grad_norm": 6.00834321975708, "learning_rate": 1.956325971111703e-05, "loss": 0.1044, "mean_token_accuracy": 0.9713066816329956, "num_tokens": 1811973.0, "step": 850 }, { "entropy": 1.7431592702865601, "epoch": 0.2814351547070441, "grad_norm": 4.275935173034668, "learning_rate": 1.955526957151976e-05, "loss": 0.0821, "mean_token_accuracy": 0.9789995789527893, "num_tokens": 1822550.0, "step": 855 }, { "entropy": 1.7215226411819458, "epoch": 0.28308097432521395, "grad_norm": 2.8283443450927734, "learning_rate": 1.954720866508546e-05, "loss": 0.0592, "mean_token_accuracy": 0.9890708029270172, "num_tokens": 1833405.0, "step": 860 }, { "entropy": 1.7675551056861878, "epoch": 0.2847267939433838, "grad_norm": 4.092453956604004, "learning_rate": 1.9539077051513624e-05, "loss": 0.0992, "mean_token_accuracy": 0.973135906457901, "num_tokens": 1843944.0, "step": 865 }, { "entropy": 1.7665972590446473, "epoch": 0.28637261356155364, "grad_norm": 3.9631991386413574, "learning_rate": 1.9530874791027425e-05, "loss": 0.1248, "mean_token_accuracy": 0.9706246018409729, "num_tokens": 1854547.0, "step": 870 }, { "entropy": 1.7626410603523255, "epoch": 0.2880184331797235, "grad_norm": 5.616950988769531, "learning_rate": 1.952260194437324e-05, "loss": 0.0911, "mean_token_accuracy": 0.9797743022441864, "num_tokens": 1864880.0, "step": 875 }, { "entropy": 1.7034204244613647, "epoch": 0.2896642527978934, "grad_norm": 3.930392265319824, "learning_rate": 1.9514258572820216e-05, "loss": 0.0789, "mean_token_accuracy": 0.9830069661140441, "num_tokens": 1875669.0, "step": 880 }, { "entropy": 1.72216739654541, "epoch": 0.2913100724160632, "grad_norm": 3.14703631401062, "learning_rate": 1.9505844738159807e-05, "loss": 0.0851, "mean_token_accuracy": 0.9783756732940674, "num_tokens": 1886494.0, "step": 885 }, { "entropy": 1.671413540840149, "epoch": 0.29295589203423306, "grad_norm": 4.539898872375488, "learning_rate": 1.949736050270532e-05, "loss": 0.0766, "mean_token_accuracy": 0.9854376316070557, "num_tokens": 1897127.0, "step": 890 }, { "entropy": 1.6798272371292113, "epoch": 0.2946017116524029, "grad_norm": 3.341653347015381, "learning_rate": 1.948880592929146e-05, "loss": 0.0631, "mean_token_accuracy": 0.9869305789470673, "num_tokens": 1907594.0, "step": 895 }, { "entropy": 1.6980504512786865, "epoch": 0.29624753127057274, "grad_norm": 3.1306231021881104, "learning_rate": 1.9480181081273846e-05, "loss": 0.0964, "mean_token_accuracy": 0.9781007289886474, "num_tokens": 1918632.0, "step": 900 }, { "entropy": 1.7191000819206237, "epoch": 0.2978933508887426, "grad_norm": 3.709050416946411, "learning_rate": 1.947148602252858e-05, "loss": 0.0767, "mean_token_accuracy": 0.9797729074954986, "num_tokens": 1929270.0, "step": 905 }, { "entropy": 1.7052234768867494, "epoch": 0.2995391705069124, "grad_norm": 2.6057310104370117, "learning_rate": 1.946272081745171e-05, "loss": 0.0611, "mean_token_accuracy": 0.9820807099342346, "num_tokens": 1940143.0, "step": 910 }, { "entropy": 1.7210368633270263, "epoch": 0.30118499012508226, "grad_norm": 5.367160320281982, "learning_rate": 1.9453885530958835e-05, "loss": 0.0817, "mean_token_accuracy": 0.9767018914222717, "num_tokens": 1951059.0, "step": 915 }, { "entropy": 1.6308549642562866, "epoch": 0.30283080974325216, "grad_norm": 5.0577592849731445, "learning_rate": 1.9444980228484542e-05, "loss": 0.0641, "mean_token_accuracy": 0.9800835251808167, "num_tokens": 1962214.0, "step": 920 }, { "entropy": 1.6711305975914001, "epoch": 0.304476629361422, "grad_norm": 6.653420925140381, "learning_rate": 1.9436004975981986e-05, "loss": 0.108, "mean_token_accuracy": 0.9743327260017395, "num_tokens": 1972789.0, "step": 925 }, { "entropy": 1.679636800289154, "epoch": 0.30612244897959184, "grad_norm": 4.994918346405029, "learning_rate": 1.9426959839922367e-05, "loss": 0.0796, "mean_token_accuracy": 0.9823659062385559, "num_tokens": 1983264.0, "step": 930 }, { "entropy": 1.7234082102775574, "epoch": 0.3077682685977617, "grad_norm": 5.689176082611084, "learning_rate": 1.941784488729444e-05, "loss": 0.0774, "mean_token_accuracy": 0.9792181611061096, "num_tokens": 1993783.0, "step": 935 }, { "entropy": 1.7042930126190186, "epoch": 0.3094140882159315, "grad_norm": 2.4102768898010254, "learning_rate": 1.9408660185604035e-05, "loss": 0.0876, "mean_token_accuracy": 0.9759888887405396, "num_tokens": 2004295.0, "step": 940 }, { "entropy": 1.6130717277526856, "epoch": 0.31105990783410137, "grad_norm": 2.775099277496338, "learning_rate": 1.939940580287354e-05, "loss": 0.0805, "mean_token_accuracy": 0.9795459330081939, "num_tokens": 2015224.0, "step": 945 }, { "entropy": 1.5929541826248168, "epoch": 0.3127057274522712, "grad_norm": 3.403689384460449, "learning_rate": 1.9390081807641413e-05, "loss": 0.0899, "mean_token_accuracy": 0.9769740760326385, "num_tokens": 2025787.0, "step": 950 }, { "entropy": 1.5571842432022094, "epoch": 0.3143515470704411, "grad_norm": 3.0090384483337402, "learning_rate": 1.938068826896166e-05, "loss": 0.06, "mean_token_accuracy": 0.9822336971759796, "num_tokens": 2036384.0, "step": 955 }, { "entropy": 1.5760810375213623, "epoch": 0.31599736668861095, "grad_norm": 2.722409725189209, "learning_rate": 1.9371225256403328e-05, "loss": 0.0888, "mean_token_accuracy": 0.9776059925556183, "num_tokens": 2047090.0, "step": 960 }, { "entropy": 1.6288114428520202, "epoch": 0.3176431863067808, "grad_norm": 2.649474620819092, "learning_rate": 1.9361692840049997e-05, "loss": 0.066, "mean_token_accuracy": 0.979485136270523, "num_tokens": 2057684.0, "step": 965 }, { "entropy": 1.710892951488495, "epoch": 0.31928900592495063, "grad_norm": 2.8936843872070312, "learning_rate": 1.935209109049925e-05, "loss": 0.0862, "mean_token_accuracy": 0.9751926064491272, "num_tokens": 2068108.0, "step": 970 }, { "entropy": 1.7829145908355712, "epoch": 0.32093482554312047, "grad_norm": 2.203496217727661, "learning_rate": 1.9342420078862153e-05, "loss": 0.0902, "mean_token_accuracy": 0.9730205178260803, "num_tokens": 2078727.0, "step": 975 }, { "entropy": 1.6852322578430177, "epoch": 0.3225806451612903, "grad_norm": 2.909604072570801, "learning_rate": 1.933267987676274e-05, "loss": 0.0754, "mean_token_accuracy": 0.9842035174369812, "num_tokens": 2089939.0, "step": 980 }, { "entropy": 1.73391033411026, "epoch": 0.32422646477946016, "grad_norm": 2.8957767486572266, "learning_rate": 1.9322870556337466e-05, "loss": 0.0894, "mean_token_accuracy": 0.9820044219493866, "num_tokens": 2100675.0, "step": 985 }, { "entropy": 1.7834203481674193, "epoch": 0.32587228439763, "grad_norm": 5.757988929748535, "learning_rate": 1.931299219023469e-05, "loss": 0.085, "mean_token_accuracy": 0.977233600616455, "num_tokens": 2111220.0, "step": 990 }, { "entropy": 1.7618568420410157, "epoch": 0.3275181040157999, "grad_norm": 5.181485176086426, "learning_rate": 1.9303044851614106e-05, "loss": 0.0899, "mean_token_accuracy": 0.9746578216552735, "num_tokens": 2121728.0, "step": 995 }, { "entropy": 1.7923735499382019, "epoch": 0.32916392363396973, "grad_norm": 9.63333797454834, "learning_rate": 1.9293028614146246e-05, "loss": 0.1188, "mean_token_accuracy": 0.9688325583934784, "num_tokens": 2132434.0, "step": 1000 }, { "entropy": 1.860191786289215, "epoch": 0.3308097432521396, "grad_norm": 4.145689487457275, "learning_rate": 1.9282943552011892e-05, "loss": 0.0798, "mean_token_accuracy": 0.9769950270652771, "num_tokens": 2142917.0, "step": 1005 }, { "entropy": 1.833854353427887, "epoch": 0.3324555628703094, "grad_norm": 3.677757501602173, "learning_rate": 1.927278973990156e-05, "loss": 0.0828, "mean_token_accuracy": 0.9798845946788788, "num_tokens": 2153639.0, "step": 1010 }, { "entropy": 1.8186659574508668, "epoch": 0.33410138248847926, "grad_norm": 4.619242191314697, "learning_rate": 1.9262567253014922e-05, "loss": 0.0607, "mean_token_accuracy": 0.9790556609630585, "num_tokens": 2164673.0, "step": 1015 }, { "entropy": 1.8815788626670837, "epoch": 0.3357472021066491, "grad_norm": 6.019654273986816, "learning_rate": 1.925227616706026e-05, "loss": 0.0804, "mean_token_accuracy": 0.9770128130912781, "num_tokens": 2175078.0, "step": 1020 }, { "entropy": 1.8578821659088134, "epoch": 0.33739302172481894, "grad_norm": 4.046188831329346, "learning_rate": 1.924191655825391e-05, "loss": 0.0688, "mean_token_accuracy": 0.979873389005661, "num_tokens": 2185679.0, "step": 1025 }, { "entropy": 1.9458543419837953, "epoch": 0.3390388413429888, "grad_norm": 8.663795471191406, "learning_rate": 1.9231488503319687e-05, "loss": 0.1165, "mean_token_accuracy": 0.9754189252853394, "num_tokens": 2196306.0, "step": 1030 }, { "entropy": 1.9784336686134338, "epoch": 0.3406846609611587, "grad_norm": 4.623157501220703, "learning_rate": 1.9220992079488322e-05, "loss": 0.0863, "mean_token_accuracy": 0.9824258208274841, "num_tokens": 2207144.0, "step": 1035 }, { "entropy": 2.0585790038108827, "epoch": 0.3423304805793285, "grad_norm": 5.089199066162109, "learning_rate": 1.9210427364496894e-05, "loss": 0.0813, "mean_token_accuracy": 0.9741666853427887, "num_tokens": 2217699.0, "step": 1040 }, { "entropy": 1.9863484263420106, "epoch": 0.34397630019749836, "grad_norm": 2.4876182079315186, "learning_rate": 1.9199794436588244e-05, "loss": 0.0936, "mean_token_accuracy": 0.9748422205448151, "num_tokens": 2228579.0, "step": 1045 }, { "entropy": 2.034103310108185, "epoch": 0.3456221198156682, "grad_norm": 3.3714184761047363, "learning_rate": 1.9189093374510403e-05, "loss": 0.1052, "mean_token_accuracy": 0.9673223495483398, "num_tokens": 2239194.0, "step": 1050 }, { "entropy": 1.954673457145691, "epoch": 0.34726793943383805, "grad_norm": 3.50931453704834, "learning_rate": 1.917832425751601e-05, "loss": 0.0578, "mean_token_accuracy": 0.9865812182426452, "num_tokens": 2249697.0, "step": 1055 }, { "entropy": 1.8305863380432128, "epoch": 0.3489137590520079, "grad_norm": 5.005392074584961, "learning_rate": 1.9167487165361726e-05, "loss": 0.0775, "mean_token_accuracy": 0.9806217133998871, "num_tokens": 2260788.0, "step": 1060 }, { "entropy": 1.7882127285003662, "epoch": 0.35055957867017773, "grad_norm": 1.8662757873535156, "learning_rate": 1.9156582178307625e-05, "loss": 0.1092, "mean_token_accuracy": 0.9730004489421844, "num_tokens": 2271354.0, "step": 1065 }, { "entropy": 1.7463540315628052, "epoch": 0.35220539828834757, "grad_norm": 7.321222305297852, "learning_rate": 1.9145609377116635e-05, "loss": 0.0659, "mean_token_accuracy": 0.9823782205581665, "num_tokens": 2282285.0, "step": 1070 }, { "entropy": 1.7453363537788391, "epoch": 0.35385121790651747, "grad_norm": 3.2284722328186035, "learning_rate": 1.9134568843053895e-05, "loss": 0.0813, "mean_token_accuracy": 0.9803031504154205, "num_tokens": 2293037.0, "step": 1075 }, { "entropy": 1.76043781042099, "epoch": 0.3554970375246873, "grad_norm": 2.238819122314453, "learning_rate": 1.91234606578862e-05, "loss": 0.0761, "mean_token_accuracy": 0.9841830551624298, "num_tokens": 2303850.0, "step": 1080 }, { "entropy": 1.7409037947654724, "epoch": 0.35714285714285715, "grad_norm": 7.227831840515137, "learning_rate": 1.911228490388136e-05, "loss": 0.0743, "mean_token_accuracy": 0.9812696516513825, "num_tokens": 2314805.0, "step": 1085 }, { "entropy": 1.8239703059196473, "epoch": 0.358788676761027, "grad_norm": 5.955824851989746, "learning_rate": 1.9101041663807606e-05, "loss": 0.0915, "mean_token_accuracy": 0.976921284198761, "num_tokens": 2325296.0, "step": 1090 }, { "entropy": 1.8835476994514466, "epoch": 0.36043449637919683, "grad_norm": 2.8221843242645264, "learning_rate": 1.9089731020932972e-05, "loss": 0.0631, "mean_token_accuracy": 0.9831080317497254, "num_tokens": 2336116.0, "step": 1095 }, { "entropy": 1.9333994030952453, "epoch": 0.3620803159973667, "grad_norm": 6.433844089508057, "learning_rate": 1.907835305902469e-05, "loss": 0.0998, "mean_token_accuracy": 0.9790532469749451, "num_tokens": 2346756.0, "step": 1100 }, { "entropy": 2.0366763949394224, "epoch": 0.3637261356155365, "grad_norm": 13.404642105102539, "learning_rate": 1.906690786234855e-05, "loss": 0.0569, "mean_token_accuracy": 0.9880981385707855, "num_tokens": 2357587.0, "step": 1105 }, { "entropy": 2.0615396976470945, "epoch": 0.36537195523370636, "grad_norm": 4.546095371246338, "learning_rate": 1.9055395515668288e-05, "loss": 0.0903, "mean_token_accuracy": 0.9765397369861603, "num_tokens": 2368379.0, "step": 1110 }, { "entropy": 2.036512088775635, "epoch": 0.36701777485187626, "grad_norm": 4.192592144012451, "learning_rate": 1.9043816104244964e-05, "loss": 0.0779, "mean_token_accuracy": 0.9820389151573181, "num_tokens": 2379490.0, "step": 1115 }, { "entropy": 2.113160729408264, "epoch": 0.3686635944700461, "grad_norm": 4.126070976257324, "learning_rate": 1.9032169713836314e-05, "loss": 0.0884, "mean_token_accuracy": 0.9770908057689667, "num_tokens": 2390060.0, "step": 1120 }, { "entropy": 2.1656644344329834, "epoch": 0.37030941408821594, "grad_norm": 2.7143678665161133, "learning_rate": 1.9020456430696126e-05, "loss": 0.0493, "mean_token_accuracy": 0.9862718224525452, "num_tokens": 2400221.0, "step": 1125 }, { "entropy": 2.1137581944465635, "epoch": 0.3719552337063858, "grad_norm": 4.085227012634277, "learning_rate": 1.9008676341573606e-05, "loss": 0.0521, "mean_token_accuracy": 0.988205760717392, "num_tokens": 2410885.0, "step": 1130 }, { "entropy": 2.044228506088257, "epoch": 0.3736010533245556, "grad_norm": 5.65952205657959, "learning_rate": 1.8996829533712723e-05, "loss": 0.0629, "mean_token_accuracy": 0.984690648317337, "num_tokens": 2421494.0, "step": 1135 }, { "entropy": 2.0729115128517153, "epoch": 0.37524687294272546, "grad_norm": 3.8629510402679443, "learning_rate": 1.898491609485156e-05, "loss": 0.0862, "mean_token_accuracy": 0.9760863423347473, "num_tokens": 2432022.0, "step": 1140 }, { "entropy": 2.1294867634773254, "epoch": 0.3768926925608953, "grad_norm": 4.639437198638916, "learning_rate": 1.8972936113221696e-05, "loss": 0.0759, "mean_token_accuracy": 0.9800122320652008, "num_tokens": 2442748.0, "step": 1145 }, { "entropy": 2.141499698162079, "epoch": 0.3785385121790652, "grad_norm": 3.745588541030884, "learning_rate": 1.8960889677547506e-05, "loss": 0.0843, "mean_token_accuracy": 0.9796210825443268, "num_tokens": 2453552.0, "step": 1150 }, { "entropy": 2.2506140947341917, "epoch": 0.38018433179723504, "grad_norm": 1.9200206995010376, "learning_rate": 1.8948776877045535e-05, "loss": 0.0818, "mean_token_accuracy": 0.9776529312133789, "num_tokens": 2464201.0, "step": 1155 }, { "entropy": 2.2783297538757323, "epoch": 0.3818301514154049, "grad_norm": 3.2144906520843506, "learning_rate": 1.893659780142384e-05, "loss": 0.0748, "mean_token_accuracy": 0.9799034297466278, "num_tokens": 2475109.0, "step": 1160 }, { "entropy": 2.235105013847351, "epoch": 0.3834759710335747, "grad_norm": 11.074976921081543, "learning_rate": 1.8924352540881298e-05, "loss": 0.0759, "mean_token_accuracy": 0.9808390080928803, "num_tokens": 2485624.0, "step": 1165 }, { "entropy": 2.216382932662964, "epoch": 0.38512179065174457, "grad_norm": 5.54982328414917, "learning_rate": 1.891204118610696e-05, "loss": 0.1232, "mean_token_accuracy": 0.9734435498714447, "num_tokens": 2496301.0, "step": 1170 }, { "entropy": 2.362920618057251, "epoch": 0.3867676102699144, "grad_norm": 4.762061595916748, "learning_rate": 1.8899663828279387e-05, "loss": 0.0906, "mean_token_accuracy": 0.9737257599830628, "num_tokens": 2506985.0, "step": 1175 }, { "entropy": 2.3827850580215455, "epoch": 0.38841342988808425, "grad_norm": 4.1375732421875, "learning_rate": 1.8887220559065946e-05, "loss": 0.0868, "mean_token_accuracy": 0.9808934390544891, "num_tokens": 2517845.0, "step": 1180 }, { "entropy": 2.4273977756500242, "epoch": 0.3900592495062541, "grad_norm": 3.6323893070220947, "learning_rate": 1.8874711470622152e-05, "loss": 0.0928, "mean_token_accuracy": 0.9732855081558227, "num_tokens": 2528576.0, "step": 1185 }, { "entropy": 2.3609946250915526, "epoch": 0.391705069124424, "grad_norm": 2.174260377883911, "learning_rate": 1.886213665559099e-05, "loss": 0.0884, "mean_token_accuracy": 0.9792202174663543, "num_tokens": 2539169.0, "step": 1190 }, { "entropy": 2.293043828010559, "epoch": 0.39335088874259383, "grad_norm": 4.491889953613281, "learning_rate": 1.8849496207102204e-05, "loss": 0.0859, "mean_token_accuracy": 0.9761190593242646, "num_tokens": 2549674.0, "step": 1195 }, { "entropy": 2.2694995403289795, "epoch": 0.39499670836076367, "grad_norm": 5.4928812980651855, "learning_rate": 1.8836790218771637e-05, "loss": 0.0828, "mean_token_accuracy": 0.9821516633033752, "num_tokens": 2560364.0, "step": 1200 }, { "entropy": 2.3214454650878906, "epoch": 0.3966425279789335, "grad_norm": 3.3526692390441895, "learning_rate": 1.882401878470052e-05, "loss": 0.0621, "mean_token_accuracy": 0.9838208556175232, "num_tokens": 2570839.0, "step": 1205 }, { "entropy": 2.4305625438690184, "epoch": 0.39828834759710335, "grad_norm": 2.5236058235168457, "learning_rate": 1.8811181999474763e-05, "loss": 0.0772, "mean_token_accuracy": 0.9798738360404968, "num_tokens": 2581459.0, "step": 1210 }, { "entropy": 2.4986673831939696, "epoch": 0.3999341672152732, "grad_norm": 5.663600921630859, "learning_rate": 1.8798279958164295e-05, "loss": 0.0663, "mean_token_accuracy": 0.9851954817771912, "num_tokens": 2591722.0, "step": 1215 }, { "epoch": 0.40026333113890716, "eval_entropy": 2.455367434179405, "eval_loss": 0.07768969982862473, "eval_mean_token_accuracy": 0.9805304530889912, "eval_num_tokens": 2593845.0, "eval_runtime": 196.539, "eval_samples_per_second": 42.368, "eval_steps_per_second": 7.062, "step": 1216 }, { "entropy": 2.4370908737182617, "epoch": 0.40157998683344304, "grad_norm": 3.0529227256774902, "learning_rate": 1.878531275632232e-05, "loss": 0.1109, "mean_token_accuracy": 0.9730750441551208, "num_tokens": 2602413.0, "step": 1220 }, { "entropy": 2.4589828968048097, "epoch": 0.4032258064516129, "grad_norm": 7.966182231903076, "learning_rate": 1.8772280489984628e-05, "loss": 0.0739, "mean_token_accuracy": 0.9801295638084412, "num_tokens": 2613250.0, "step": 1225 }, { "entropy": 2.4703043937683105, "epoch": 0.4048716260697828, "grad_norm": 4.381008148193359, "learning_rate": 1.875918325566888e-05, "loss": 0.0812, "mean_token_accuracy": 0.9783048331737518, "num_tokens": 2624053.0, "step": 1230 }, { "entropy": 2.4899261713027956, "epoch": 0.4065174456879526, "grad_norm": 4.341674327850342, "learning_rate": 1.8746021150373892e-05, "loss": 0.0768, "mean_token_accuracy": 0.9822542846202851, "num_tokens": 2634584.0, "step": 1235 }, { "entropy": 2.5025168657302856, "epoch": 0.40816326530612246, "grad_norm": 4.821120262145996, "learning_rate": 1.873279427157892e-05, "loss": 0.0649, "mean_token_accuracy": 0.9847510397434235, "num_tokens": 2645173.0, "step": 1240 }, { "entropy": 2.5438794136047362, "epoch": 0.4098090849242923, "grad_norm": 4.269495010375977, "learning_rate": 1.8719502717242937e-05, "loss": 0.0801, "mean_token_accuracy": 0.9804035782814026, "num_tokens": 2655404.0, "step": 1245 }, { "entropy": 2.467151665687561, "epoch": 0.41145490454246214, "grad_norm": 2.994798421859741, "learning_rate": 1.8706146585803903e-05, "loss": 0.0676, "mean_token_accuracy": 0.9815965473651886, "num_tokens": 2666063.0, "step": 1250 }, { "entropy": 2.3562156200408935, "epoch": 0.413100724160632, "grad_norm": 4.536716461181641, "learning_rate": 1.8692725976178038e-05, "loss": 0.0718, "mean_token_accuracy": 0.9820040583610534, "num_tokens": 2676929.0, "step": 1255 }, { "entropy": 2.3580038785934447, "epoch": 0.4147465437788018, "grad_norm": 3.2308404445648193, "learning_rate": 1.8679240987759098e-05, "loss": 0.0654, "mean_token_accuracy": 0.9846524596214294, "num_tokens": 2687554.0, "step": 1260 }, { "entropy": 2.24311740398407, "epoch": 0.41639236339697167, "grad_norm": 3.5214755535125732, "learning_rate": 1.8665691720417624e-05, "loss": 0.0644, "mean_token_accuracy": 0.9793017745018006, "num_tokens": 2698138.0, "step": 1265 }, { "entropy": 2.195026898384094, "epoch": 0.41803818301514156, "grad_norm": 2.515350818634033, "learning_rate": 1.865207827450022e-05, "loss": 0.0624, "mean_token_accuracy": 0.9875331342220306, "num_tokens": 2708644.0, "step": 1270 }, { "entropy": 2.1811726808547975, "epoch": 0.4196840026333114, "grad_norm": 3.122403144836426, "learning_rate": 1.8638400750828793e-05, "loss": 0.0525, "mean_token_accuracy": 0.9861293792724609, "num_tokens": 2719229.0, "step": 1275 }, { "entropy": 2.1384164810180666, "epoch": 0.42132982225148125, "grad_norm": 3.760925531387329, "learning_rate": 1.8624659250699807e-05, "loss": 0.1054, "mean_token_accuracy": 0.973668885231018, "num_tokens": 2729859.0, "step": 1280 }, { "entropy": 2.136362624168396, "epoch": 0.4229756418696511, "grad_norm": 3.5055840015411377, "learning_rate": 1.8610853875883553e-05, "loss": 0.0707, "mean_token_accuracy": 0.980605673789978, "num_tokens": 2740393.0, "step": 1285 }, { "entropy": 2.1360629320144655, "epoch": 0.42462146148782093, "grad_norm": 5.643499851226807, "learning_rate": 1.8596984728623374e-05, "loss": 0.0891, "mean_token_accuracy": 0.9784270226955414, "num_tokens": 2750842.0, "step": 1290 }, { "entropy": 2.1816317081451415, "epoch": 0.42626728110599077, "grad_norm": 2.8327138423919678, "learning_rate": 1.858305191163491e-05, "loss": 0.0915, "mean_token_accuracy": 0.9775237262248992, "num_tokens": 2761249.0, "step": 1295 }, { "entropy": 2.2412596225738524, "epoch": 0.4279131007241606, "grad_norm": 1.0889869928359985, "learning_rate": 1.8569055528105356e-05, "loss": 0.0952, "mean_token_accuracy": 0.9791838228702545, "num_tokens": 2771715.0, "step": 1300 }, { "entropy": 2.200755214691162, "epoch": 0.4295589203423305, "grad_norm": 2.3900723457336426, "learning_rate": 1.855499568169267e-05, "loss": 0.0761, "mean_token_accuracy": 0.9767322957515716, "num_tokens": 2782449.0, "step": 1305 }, { "entropy": 2.1307253360748293, "epoch": 0.43120473996050035, "grad_norm": 3.4063174724578857, "learning_rate": 1.854087247652483e-05, "loss": 0.0675, "mean_token_accuracy": 0.9809297919273376, "num_tokens": 2792843.0, "step": 1310 }, { "entropy": 2.0838756799697875, "epoch": 0.4328505595786702, "grad_norm": 2.3350863456726074, "learning_rate": 1.8526686017199046e-05, "loss": 0.0859, "mean_token_accuracy": 0.9777861475944519, "num_tokens": 2803434.0, "step": 1315 }, { "entropy": 2.061713254451752, "epoch": 0.43449637919684003, "grad_norm": 3.2840187549591064, "learning_rate": 1.8512436408780995e-05, "loss": 0.0994, "mean_token_accuracy": 0.979932302236557, "num_tokens": 2814246.0, "step": 1320 }, { "entropy": 2.1339207768440245, "epoch": 0.4361421988150099, "grad_norm": 3.665444850921631, "learning_rate": 1.8498123756804038e-05, "loss": 0.0704, "mean_token_accuracy": 0.9794367909431457, "num_tokens": 2825017.0, "step": 1325 }, { "entropy": 2.1423274278640747, "epoch": 0.4377880184331797, "grad_norm": 4.194466590881348, "learning_rate": 1.848374816726844e-05, "loss": 0.0636, "mean_token_accuracy": 0.9790065705776214, "num_tokens": 2835765.0, "step": 1330 }, { "entropy": 2.220040726661682, "epoch": 0.43943383805134956, "grad_norm": 2.7448790073394775, "learning_rate": 1.8469309746640587e-05, "loss": 0.0651, "mean_token_accuracy": 0.9824341356754303, "num_tokens": 2846294.0, "step": 1335 }, { "entropy": 2.1996994495391844, "epoch": 0.4410796576695194, "grad_norm": 3.1929495334625244, "learning_rate": 1.845480860185219e-05, "loss": 0.0431, "mean_token_accuracy": 0.9881280601024628, "num_tokens": 2856892.0, "step": 1340 }, { "entropy": 2.0707924485206606, "epoch": 0.4427254772876893, "grad_norm": 7.250471115112305, "learning_rate": 1.8440244840299507e-05, "loss": 0.0697, "mean_token_accuracy": 0.983168751001358, "num_tokens": 2867704.0, "step": 1345 }, { "entropy": 1.9964185953140259, "epoch": 0.44437129690585914, "grad_norm": 5.546008586883545, "learning_rate": 1.8425618569842528e-05, "loss": 0.0749, "mean_token_accuracy": 0.9826294600963592, "num_tokens": 2878605.0, "step": 1350 }, { "entropy": 2.0520959854125977, "epoch": 0.446017116524029, "grad_norm": 2.6965854167938232, "learning_rate": 1.8410929898804197e-05, "loss": 0.0748, "mean_token_accuracy": 0.983626252412796, "num_tokens": 2889180.0, "step": 1355 }, { "entropy": 2.033618211746216, "epoch": 0.4476629361421988, "grad_norm": 6.2912116050720215, "learning_rate": 1.83961789359696e-05, "loss": 0.0658, "mean_token_accuracy": 0.9775195240974426, "num_tokens": 2899792.0, "step": 1360 }, { "entropy": 2.0571701526641846, "epoch": 0.44930875576036866, "grad_norm": 5.728684425354004, "learning_rate": 1.838136579058515e-05, "loss": 0.1018, "mean_token_accuracy": 0.9756032049655914, "num_tokens": 2910450.0, "step": 1365 }, { "entropy": 2.135818696022034, "epoch": 0.4509545753785385, "grad_norm": 2.3949079513549805, "learning_rate": 1.8366490572357798e-05, "loss": 0.0688, "mean_token_accuracy": 0.9789416313171386, "num_tokens": 2921065.0, "step": 1370 }, { "entropy": 2.213190221786499, "epoch": 0.45260039499670834, "grad_norm": 3.1379809379577637, "learning_rate": 1.8351553391454203e-05, "loss": 0.0926, "mean_token_accuracy": 0.9748626470565795, "num_tokens": 2931783.0, "step": 1375 }, { "entropy": 2.259020519256592, "epoch": 0.4542462146148782, "grad_norm": 6.139921188354492, "learning_rate": 1.8336554358499923e-05, "loss": 0.0883, "mean_token_accuracy": 0.9798992872238159, "num_tokens": 2942729.0, "step": 1380 }, { "entropy": 2.2399872303009034, "epoch": 0.4558920342330481, "grad_norm": 4.462457180023193, "learning_rate": 1.83214935845786e-05, "loss": 0.0942, "mean_token_accuracy": 0.9766364514827728, "num_tokens": 2953444.0, "step": 1385 }, { "entropy": 2.3818634748458862, "epoch": 0.4575378538512179, "grad_norm": 2.3201687335968018, "learning_rate": 1.830637118123113e-05, "loss": 0.06, "mean_token_accuracy": 0.987183290719986, "num_tokens": 2963968.0, "step": 1390 }, { "entropy": 2.319228458404541, "epoch": 0.45918367346938777, "grad_norm": 1.5512436628341675, "learning_rate": 1.8291187260454842e-05, "loss": 0.0429, "mean_token_accuracy": 0.9886379659175872, "num_tokens": 2974544.0, "step": 1395 }, { "entropy": 2.212476873397827, "epoch": 0.4608294930875576, "grad_norm": 3.864231586456299, "learning_rate": 1.827594193470266e-05, "loss": 0.0719, "mean_token_accuracy": 0.9817937731742858, "num_tokens": 2985035.0, "step": 1400 }, { "entropy": 2.160155749320984, "epoch": 0.46247531270572745, "grad_norm": 4.27418851852417, "learning_rate": 1.8260635316882288e-05, "loss": 0.0752, "mean_token_accuracy": 0.9822825253009796, "num_tokens": 2995814.0, "step": 1405 }, { "entropy": 2.1970561265945436, "epoch": 0.4641211323238973, "grad_norm": 7.332748889923096, "learning_rate": 1.8245267520355348e-05, "loss": 0.0787, "mean_token_accuracy": 0.9827774345874787, "num_tokens": 3006260.0, "step": 1410 }, { "entropy": 2.339077877998352, "epoch": 0.46576695194206713, "grad_norm": 1.8340346813201904, "learning_rate": 1.8229838658936566e-05, "loss": 0.0763, "mean_token_accuracy": 0.980904471874237, "num_tokens": 3016932.0, "step": 1415 }, { "entropy": 2.3882060766220095, "epoch": 0.467412771560237, "grad_norm": 2.010164737701416, "learning_rate": 1.8214348846892913e-05, "loss": 0.0488, "mean_token_accuracy": 0.9874413549900055, "num_tokens": 3027677.0, "step": 1420 }, { "entropy": 2.291885328292847, "epoch": 0.46905859117840687, "grad_norm": 2.1564724445343018, "learning_rate": 1.8198798198942768e-05, "loss": 0.0702, "mean_token_accuracy": 0.9820050477981568, "num_tokens": 3038413.0, "step": 1425 }, { "entropy": 2.285910177230835, "epoch": 0.4707044107965767, "grad_norm": 3.993086576461792, "learning_rate": 1.8183186830255058e-05, "loss": 0.0707, "mean_token_accuracy": 0.9839109480381012, "num_tokens": 3049009.0, "step": 1430 }, { "entropy": 2.243725609779358, "epoch": 0.47235023041474655, "grad_norm": 3.17380952835083, "learning_rate": 1.8167514856448413e-05, "loss": 0.0509, "mean_token_accuracy": 0.9843453109264374, "num_tokens": 3059423.0, "step": 1435 }, { "entropy": 2.16868679523468, "epoch": 0.4739960500329164, "grad_norm": 2.891127824783325, "learning_rate": 1.815178239359031e-05, "loss": 0.0586, "mean_token_accuracy": 0.9830320298671722, "num_tokens": 3070009.0, "step": 1440 }, { "entropy": 2.055533969402313, "epoch": 0.47564186965108624, "grad_norm": 4.964085578918457, "learning_rate": 1.8135989558196207e-05, "loss": 0.039, "mean_token_accuracy": 0.9861751973628998, "num_tokens": 3080665.0, "step": 1445 }, { "entropy": 2.027339553833008, "epoch": 0.4772876892692561, "grad_norm": 7.009729862213135, "learning_rate": 1.812013646722869e-05, "loss": 0.096, "mean_token_accuracy": 0.9795560419559479, "num_tokens": 3090979.0, "step": 1450 }, { "entropy": 2.1618866443634035, "epoch": 0.4789335088874259, "grad_norm": 3.8036680221557617, "learning_rate": 1.8104223238096596e-05, "loss": 0.0902, "mean_token_accuracy": 0.9777205526828766, "num_tokens": 3101297.0, "step": 1455 }, { "entropy": 2.24229896068573, "epoch": 0.48057932850559576, "grad_norm": 2.5701446533203125, "learning_rate": 1.808824998865415e-05, "loss": 0.0397, "mean_token_accuracy": 0.9876787722110748, "num_tokens": 3111894.0, "step": 1460 }, { "entropy": 2.251754140853882, "epoch": 0.48222514812376566, "grad_norm": 5.5385847091674805, "learning_rate": 1.8072216837200094e-05, "loss": 0.0769, "mean_token_accuracy": 0.9802028417587281, "num_tokens": 3122674.0, "step": 1465 }, { "entropy": 2.185230827331543, "epoch": 0.4838709677419355, "grad_norm": 4.962352752685547, "learning_rate": 1.80561239024768e-05, "loss": 0.0529, "mean_token_accuracy": 0.9858423829078674, "num_tokens": 3133175.0, "step": 1470 }, { "entropy": 2.1357677221298217, "epoch": 0.48551678736010534, "grad_norm": 6.6793389320373535, "learning_rate": 1.8039971303669407e-05, "loss": 0.0698, "mean_token_accuracy": 0.983429902791977, "num_tokens": 3143713.0, "step": 1475 }, { "entropy": 2.1006932973861696, "epoch": 0.4871626069782752, "grad_norm": 1.9220439195632935, "learning_rate": 1.8023759160404923e-05, "loss": 0.0704, "mean_token_accuracy": 0.9849179744720459, "num_tokens": 3154604.0, "step": 1480 }, { "entropy": 2.1457018852233887, "epoch": 0.488808426596445, "grad_norm": 2.7263882160186768, "learning_rate": 1.8007487592751343e-05, "loss": 0.0578, "mean_token_accuracy": 0.9825065851211547, "num_tokens": 3165594.0, "step": 1485 }, { "entropy": 2.2146376371383667, "epoch": 0.49045424621461486, "grad_norm": 4.0069804191589355, "learning_rate": 1.799115672121677e-05, "loss": 0.0591, "mean_token_accuracy": 0.9832662463188171, "num_tokens": 3176301.0, "step": 1490 }, { "entropy": 2.2213491678237913, "epoch": 0.4921000658327847, "grad_norm": 2.8298332691192627, "learning_rate": 1.7974766666748516e-05, "loss": 0.0803, "mean_token_accuracy": 0.9778595089912414, "num_tokens": 3186898.0, "step": 1495 }, { "entropy": 2.203892719745636, "epoch": 0.4937458854509546, "grad_norm": 2.422091484069824, "learning_rate": 1.7958317550732193e-05, "loss": 0.0591, "mean_token_accuracy": 0.9813643217086792, "num_tokens": 3197716.0, "step": 1500 }, { "entropy": 2.1539814710617065, "epoch": 0.49539170506912444, "grad_norm": 1.6230992078781128, "learning_rate": 1.7941809494990838e-05, "loss": 0.0702, "mean_token_accuracy": 0.9812954783439636, "num_tokens": 3208681.0, "step": 1505 }, { "entropy": 2.0968565344810486, "epoch": 0.4970375246872943, "grad_norm": 4.276401042938232, "learning_rate": 1.792524262178399e-05, "loss": 0.0957, "mean_token_accuracy": 0.9712316334247589, "num_tokens": 3219313.0, "step": 1510 }, { "entropy": 1.9671455860137939, "epoch": 0.4986833443054641, "grad_norm": 2.3620660305023193, "learning_rate": 1.7908617053806802e-05, "loss": 0.0548, "mean_token_accuracy": 0.9848778009414673, "num_tokens": 3229908.0, "step": 1515 }, { "entropy": 1.9334992289543151, "epoch": 0.500329163923634, "grad_norm": 2.5839757919311523, "learning_rate": 1.7891932914189112e-05, "loss": 0.0542, "mean_token_accuracy": 0.9889938116073609, "num_tokens": 3240444.0, "step": 1520 }, { "entropy": 1.9328988552093507, "epoch": 0.5019749835418038, "grad_norm": 2.2053468227386475, "learning_rate": 1.7875190326494552e-05, "loss": 0.0706, "mean_token_accuracy": 0.9844483017921448, "num_tokens": 3251129.0, "step": 1525 }, { "entropy": 1.900172483921051, "epoch": 0.5036208031599737, "grad_norm": 3.773327112197876, "learning_rate": 1.7858389414719628e-05, "loss": 0.0567, "mean_token_accuracy": 0.9843137383460998, "num_tokens": 3261867.0, "step": 1530 }, { "entropy": 1.9451073169708253, "epoch": 0.5052666227781435, "grad_norm": 3.007178544998169, "learning_rate": 1.7841530303292782e-05, "loss": 0.0478, "mean_token_accuracy": 0.9842455625534058, "num_tokens": 3272277.0, "step": 1535 }, { "entropy": 1.9109816670417785, "epoch": 0.5069124423963134, "grad_norm": 6.694243431091309, "learning_rate": 1.78246131170735e-05, "loss": 0.0934, "mean_token_accuracy": 0.9757687628269196, "num_tokens": 3282740.0, "step": 1540 }, { "entropy": 1.945954716205597, "epoch": 0.5085582620144832, "grad_norm": 4.0865936279296875, "learning_rate": 1.780763798135136e-05, "loss": 0.055, "mean_token_accuracy": 0.9839641332626343, "num_tokens": 3293583.0, "step": 1545 }, { "entropy": 1.8939694523811341, "epoch": 0.5102040816326531, "grad_norm": 5.349853038787842, "learning_rate": 1.779060502184513e-05, "loss": 0.093, "mean_token_accuracy": 0.9804658055305481, "num_tokens": 3304579.0, "step": 1550 }, { "entropy": 1.9622906923294068, "epoch": 0.5118499012508229, "grad_norm": 4.101441860198975, "learning_rate": 1.777351436470182e-05, "loss": 0.0978, "mean_token_accuracy": 0.9735360503196716, "num_tokens": 3315316.0, "step": 1555 }, { "entropy": 2.1062827825546266, "epoch": 0.5134957208689928, "grad_norm": 2.2984564304351807, "learning_rate": 1.775636613649574e-05, "loss": 0.0659, "mean_token_accuracy": 0.9824669599533081, "num_tokens": 3325816.0, "step": 1560 }, { "entropy": 2.0575469732284546, "epoch": 0.5151415404871627, "grad_norm": 3.7546675205230713, "learning_rate": 1.7739160464227593e-05, "loss": 0.0902, "mean_token_accuracy": 0.9760908842086792, "num_tokens": 3336506.0, "step": 1565 }, { "entropy": 1.938406229019165, "epoch": 0.5167873601053324, "grad_norm": 2.3365416526794434, "learning_rate": 1.7721897475323508e-05, "loss": 0.084, "mean_token_accuracy": 0.9817908525466919, "num_tokens": 3346922.0, "step": 1570 }, { "entropy": 1.9214160919189454, "epoch": 0.5184331797235023, "grad_norm": 3.7224996089935303, "learning_rate": 1.7704577297634096e-05, "loss": 0.0595, "mean_token_accuracy": 0.9839179992675782, "num_tokens": 3357419.0, "step": 1575 }, { "entropy": 1.8380493998527527, "epoch": 0.5200789993416721, "grad_norm": 4.267509937286377, "learning_rate": 1.768720005943353e-05, "loss": 0.087, "mean_token_accuracy": 0.9799594342708587, "num_tokens": 3368205.0, "step": 1580 }, { "entropy": 1.8690474152565002, "epoch": 0.521724818959842, "grad_norm": 2.147115468978882, "learning_rate": 1.7669765889418553e-05, "loss": 0.1006, "mean_token_accuracy": 0.9794728994369507, "num_tokens": 3378592.0, "step": 1585 }, { "entropy": 1.880600130558014, "epoch": 0.5233706385780118, "grad_norm": 3.2792811393737793, "learning_rate": 1.7652274916707566e-05, "loss": 0.0709, "mean_token_accuracy": 0.983178836107254, "num_tokens": 3389050.0, "step": 1590 }, { "entropy": 1.9782520651817321, "epoch": 0.5250164581961817, "grad_norm": 2.2735812664031982, "learning_rate": 1.7634727270839645e-05, "loss": 0.0628, "mean_token_accuracy": 0.9815650939941406, "num_tokens": 3399987.0, "step": 1595 }, { "entropy": 2.048723006248474, "epoch": 0.5266622778143516, "grad_norm": 6.50067138671875, "learning_rate": 1.761712308177359e-05, "loss": 0.1481, "mean_token_accuracy": 0.959316509962082, "num_tokens": 3410677.0, "step": 1600 }, { "entropy": 2.094959032535553, "epoch": 0.5283080974325214, "grad_norm": 7.985395908355713, "learning_rate": 1.7599462479886976e-05, "loss": 0.0756, "mean_token_accuracy": 0.9803730607032776, "num_tokens": 3421071.0, "step": 1605 }, { "entropy": 2.046134078502655, "epoch": 0.5299539170506913, "grad_norm": 5.17471981048584, "learning_rate": 1.7581745595975158e-05, "loss": 0.0956, "mean_token_accuracy": 0.9822999238967896, "num_tokens": 3432117.0, "step": 1610 }, { "entropy": 2.171485185623169, "epoch": 0.5315997366688611, "grad_norm": 1.3401827812194824, "learning_rate": 1.7563972561250323e-05, "loss": 0.0513, "mean_token_accuracy": 0.9851057410240174, "num_tokens": 3442596.0, "step": 1615 }, { "entropy": 2.094390308856964, "epoch": 0.533245556287031, "grad_norm": 5.245898246765137, "learning_rate": 1.7546143507340517e-05, "loss": 0.068, "mean_token_accuracy": 0.9802603662014008, "num_tokens": 3453151.0, "step": 1620 }, { "entropy": 2.0529758810997008, "epoch": 0.5348913759052007, "grad_norm": 2.95365309715271, "learning_rate": 1.7528258566288666e-05, "loss": 0.0872, "mean_token_accuracy": 0.9790965497493744, "num_tokens": 3463685.0, "step": 1625 }, { "entropy": 2.0168527364730835, "epoch": 0.5365371955233706, "grad_norm": 3.4701573848724365, "learning_rate": 1.75103178705516e-05, "loss": 0.0809, "mean_token_accuracy": 0.979897940158844, "num_tokens": 3474671.0, "step": 1630 }, { "entropy": 2.0435917496681215, "epoch": 0.5381830151415404, "grad_norm": 2.95245361328125, "learning_rate": 1.7492321552999076e-05, "loss": 0.0423, "mean_token_accuracy": 0.9878913462162018, "num_tokens": 3485155.0, "step": 1635 }, { "entropy": 1.9816929221153259, "epoch": 0.5398288347597103, "grad_norm": 3.2681093215942383, "learning_rate": 1.747426974691277e-05, "loss": 0.053, "mean_token_accuracy": 0.9853954315185547, "num_tokens": 3495688.0, "step": 1640 }, { "entropy": 1.8952534437179565, "epoch": 0.5414746543778802, "grad_norm": 3.5420217514038086, "learning_rate": 1.7456162585985335e-05, "loss": 0.055, "mean_token_accuracy": 0.9834610342979431, "num_tokens": 3506316.0, "step": 1645 }, { "entropy": 1.9181370496749879, "epoch": 0.54312047399605, "grad_norm": 2.8061180114746094, "learning_rate": 1.7438000204319365e-05, "loss": 0.0615, "mean_token_accuracy": 0.9796540081501007, "num_tokens": 3517096.0, "step": 1650 }, { "entropy": 1.900493335723877, "epoch": 0.5447662936142199, "grad_norm": 6.022247791290283, "learning_rate": 1.7419782736426433e-05, "loss": 0.0686, "mean_token_accuracy": 0.9824603438377381, "num_tokens": 3527608.0, "step": 1655 }, { "entropy": 1.9555213689804076, "epoch": 0.5464121132323897, "grad_norm": 2.4826130867004395, "learning_rate": 1.7401510317226077e-05, "loss": 0.0602, "mean_token_accuracy": 0.9860040962696075, "num_tokens": 3538016.0, "step": 1660 }, { "entropy": 1.9975322604179382, "epoch": 0.5480579328505596, "grad_norm": 1.6209336519241333, "learning_rate": 1.7383183082044814e-05, "loss": 0.0632, "mean_token_accuracy": 0.9838661909103393, "num_tokens": 3548608.0, "step": 1665 }, { "entropy": 2.0439995765686034, "epoch": 0.5497037524687294, "grad_norm": 2.632179021835327, "learning_rate": 1.7364801166615124e-05, "loss": 0.0615, "mean_token_accuracy": 0.9858087658882141, "num_tokens": 3558846.0, "step": 1670 }, { "entropy": 2.084509313106537, "epoch": 0.5513495720868993, "grad_norm": 2.9909346103668213, "learning_rate": 1.7346364707074453e-05, "loss": 0.045, "mean_token_accuracy": 0.9885495722293853, "num_tokens": 3569553.0, "step": 1675 }, { "entropy": 2.1361532688140867, "epoch": 0.5529953917050692, "grad_norm": 2.4530577659606934, "learning_rate": 1.732787383996421e-05, "loss": 0.0783, "mean_token_accuracy": 0.9758898675441742, "num_tokens": 3580282.0, "step": 1680 }, { "entropy": 2.134485626220703, "epoch": 0.554641211323239, "grad_norm": 2.1198272705078125, "learning_rate": 1.7309328702228742e-05, "loss": 0.0757, "mean_token_accuracy": 0.9795774221420288, "num_tokens": 3591130.0, "step": 1685 }, { "entropy": 2.2107550859451295, "epoch": 0.5562870309414089, "grad_norm": 3.947378158569336, "learning_rate": 1.729072943121433e-05, "loss": 0.1107, "mean_token_accuracy": 0.9773122131824493, "num_tokens": 3601615.0, "step": 1690 }, { "entropy": 2.156628942489624, "epoch": 0.5579328505595786, "grad_norm": 5.117584228515625, "learning_rate": 1.727207616466817e-05, "loss": 0.0573, "mean_token_accuracy": 0.9852468788623809, "num_tokens": 3612527.0, "step": 1695 }, { "entropy": 2.1495004177093504, "epoch": 0.5595786701777485, "grad_norm": 3.0924577713012695, "learning_rate": 1.725336904073735e-05, "loss": 0.0499, "mean_token_accuracy": 0.9855070650577545, "num_tokens": 3623196.0, "step": 1700 }, { "entropy": 2.139863908290863, "epoch": 0.5612244897959183, "grad_norm": 2.3820176124572754, "learning_rate": 1.723460819796783e-05, "loss": 0.0439, "mean_token_accuracy": 0.9874242305755615, "num_tokens": 3634081.0, "step": 1705 }, { "entropy": 2.0737234473228456, "epoch": 0.5628703094140882, "grad_norm": 4.342321395874023, "learning_rate": 1.7215793775303415e-05, "loss": 0.0857, "mean_token_accuracy": 0.9764466941356659, "num_tokens": 3644499.0, "step": 1710 }, { "entropy": 2.0394096612930297, "epoch": 0.5645161290322581, "grad_norm": 2.6576485633850098, "learning_rate": 1.719692591208472e-05, "loss": 0.0679, "mean_token_accuracy": 0.9818757474422455, "num_tokens": 3655219.0, "step": 1715 }, { "entropy": 2.023374152183533, "epoch": 0.5661619486504279, "grad_norm": 1.6631029844284058, "learning_rate": 1.7178004748048157e-05, "loss": 0.0681, "mean_token_accuracy": 0.9818334817886353, "num_tokens": 3665785.0, "step": 1720 }, { "entropy": 2.0491710901260376, "epoch": 0.5678077682685978, "grad_norm": 1.3181400299072266, "learning_rate": 1.7159030423324873e-05, "loss": 0.0395, "mean_token_accuracy": 0.9904541313648224, "num_tokens": 3676536.0, "step": 1725 }, { "entropy": 1.9250925421714782, "epoch": 0.5694535878867676, "grad_norm": 3.0907676219940186, "learning_rate": 1.7140003078439727e-05, "loss": 0.0726, "mean_token_accuracy": 0.9785450220108032, "num_tokens": 3687519.0, "step": 1730 }, { "entropy": 1.9846270561218262, "epoch": 0.5710994075049375, "grad_norm": 2.5993564128875732, "learning_rate": 1.712092285431026e-05, "loss": 0.0729, "mean_token_accuracy": 0.9780683517456055, "num_tokens": 3698184.0, "step": 1735 }, { "entropy": 1.9407028794288634, "epoch": 0.5727452271231073, "grad_norm": 2.01981520652771, "learning_rate": 1.710178989224562e-05, "loss": 0.0693, "mean_token_accuracy": 0.9816851794719696, "num_tokens": 3709017.0, "step": 1740 }, { "entropy": 1.9194607377052306, "epoch": 0.5743910467412772, "grad_norm": 2.252833843231201, "learning_rate": 1.7082604333945557e-05, "loss": 0.0855, "mean_token_accuracy": 0.9747888624668122, "num_tokens": 3719609.0, "step": 1745 }, { "entropy": 1.9611817717552185, "epoch": 0.576036866359447, "grad_norm": 6.348813056945801, "learning_rate": 1.7063366321499338e-05, "loss": 0.0764, "mean_token_accuracy": 0.9813452184200286, "num_tokens": 3730007.0, "step": 1750 }, { "entropy": 1.9477728486061097, "epoch": 0.5776826859776168, "grad_norm": 2.917712926864624, "learning_rate": 1.7044075997384707e-05, "loss": 0.0559, "mean_token_accuracy": 0.9842540562152863, "num_tokens": 3740830.0, "step": 1755 }, { "entropy": 2.0349454164505003, "epoch": 0.5793285055957867, "grad_norm": 2.951350212097168, "learning_rate": 1.7024733504466843e-05, "loss": 0.0465, "mean_token_accuracy": 0.9853815793991089, "num_tokens": 3751273.0, "step": 1760 }, { "entropy": 2.0125353813171385, "epoch": 0.5809743252139565, "grad_norm": 2.382908582687378, "learning_rate": 1.7005338985997273e-05, "loss": 0.0593, "mean_token_accuracy": 0.9872986257076264, "num_tokens": 3761728.0, "step": 1765 }, { "entropy": 2.04194176197052, "epoch": 0.5826201448321264, "grad_norm": 5.703618049621582, "learning_rate": 1.6985892585612848e-05, "loss": 0.1025, "mean_token_accuracy": 0.9684024691581726, "num_tokens": 3772794.0, "step": 1770 }, { "entropy": 2.026109528541565, "epoch": 0.5842659644502962, "grad_norm": 2.672579288482666, "learning_rate": 1.6966394447334638e-05, "loss": 0.0634, "mean_token_accuracy": 0.9840988755226135, "num_tokens": 3783388.0, "step": 1775 }, { "entropy": 2.0058682441711424, "epoch": 0.5859117840684661, "grad_norm": 2.395892858505249, "learning_rate": 1.69468447155669e-05, "loss": 0.0432, "mean_token_accuracy": 0.9877268433570862, "num_tokens": 3793954.0, "step": 1780 }, { "entropy": 1.9445565462112426, "epoch": 0.5875576036866359, "grad_norm": 2.7134897708892822, "learning_rate": 1.6927243535095995e-05, "loss": 0.077, "mean_token_accuracy": 0.9795953154563903, "num_tokens": 3804724.0, "step": 1785 }, { "entropy": 1.936465561389923, "epoch": 0.5892034233048058, "grad_norm": 1.9062567949295044, "learning_rate": 1.6907591051089313e-05, "loss": 0.0685, "mean_token_accuracy": 0.9803402423858643, "num_tokens": 3815254.0, "step": 1790 }, { "entropy": 1.9493210196495057, "epoch": 0.5908492429229757, "grad_norm": 3.1633780002593994, "learning_rate": 1.6887887409094195e-05, "loss": 0.0475, "mean_token_accuracy": 0.9850197255611419, "num_tokens": 3825885.0, "step": 1795 }, { "entropy": 1.8948396682739257, "epoch": 0.5924950625411455, "grad_norm": 3.2793633937835693, "learning_rate": 1.6868132755036875e-05, "loss": 0.0539, "mean_token_accuracy": 0.9861530363559723, "num_tokens": 3836784.0, "step": 1800 }, { "entropy": 1.935602605342865, "epoch": 0.5941408821593154, "grad_norm": 3.2881906032562256, "learning_rate": 1.6848327235221368e-05, "loss": 0.0827, "mean_token_accuracy": 0.9799518406391143, "num_tokens": 3847307.0, "step": 1805 }, { "entropy": 1.9104020118713378, "epoch": 0.5957867017774852, "grad_norm": 1.5671672821044922, "learning_rate": 1.6828470996328418e-05, "loss": 0.0566, "mean_token_accuracy": 0.9839206755161285, "num_tokens": 3858140.0, "step": 1810 }, { "entropy": 1.9541502475738526, "epoch": 0.5974325213956551, "grad_norm": 3.0334932804107666, "learning_rate": 1.680856418541439e-05, "loss": 0.0717, "mean_token_accuracy": 0.9826797246932983, "num_tokens": 3868625.0, "step": 1815 }, { "entropy": 1.9071632623672485, "epoch": 0.5990783410138248, "grad_norm": 3.8940181732177734, "learning_rate": 1.6788606949910188e-05, "loss": 0.0591, "mean_token_accuracy": 0.9818158507347107, "num_tokens": 3879340.0, "step": 1820 }, { "epoch": 0.6003949967083607, "eval_entropy": 1.9597396565445562, "eval_loss": 0.06656259298324585, "eval_mean_token_accuracy": 0.9824842680084602, "eval_num_tokens": 3887690.0, "eval_runtime": 195.6298, "eval_samples_per_second": 42.565, "eval_steps_per_second": 7.095, "step": 1824 }, { "entropy": 1.948818302154541, "epoch": 0.6007241606319947, "grad_norm": 2.146757125854492, "learning_rate": 1.6768599437620166e-05, "loss": 0.0598, "mean_token_accuracy": 0.9824907779693604, "num_tokens": 3889882.0, "step": 1825 }, { "entropy": 1.953722858428955, "epoch": 0.6023699802501645, "grad_norm": 4.7035112380981445, "learning_rate": 1.6748541796721026e-05, "loss": 0.0759, "mean_token_accuracy": 0.9829168796539307, "num_tokens": 3900763.0, "step": 1830 }, { "entropy": 1.9083998680114747, "epoch": 0.6040157998683344, "grad_norm": 3.567713737487793, "learning_rate": 1.6728434175760733e-05, "loss": 0.0602, "mean_token_accuracy": 0.9855620503425598, "num_tokens": 3911307.0, "step": 1835 }, { "entropy": 1.9374099254608155, "epoch": 0.6056616194865043, "grad_norm": 5.015064716339111, "learning_rate": 1.6708276723657396e-05, "loss": 0.0556, "mean_token_accuracy": 0.986617261171341, "num_tokens": 3921526.0, "step": 1840 }, { "entropy": 1.9269861102104187, "epoch": 0.6073074391046741, "grad_norm": 3.2826483249664307, "learning_rate": 1.6688069589698194e-05, "loss": 0.0727, "mean_token_accuracy": 0.9803520023822785, "num_tokens": 3932144.0, "step": 1845 }, { "entropy": 1.8954154014587403, "epoch": 0.608953258722844, "grad_norm": 6.194475173950195, "learning_rate": 1.6667812923538226e-05, "loss": 0.0841, "mean_token_accuracy": 0.9779143512248993, "num_tokens": 3943304.0, "step": 1850 }, { "entropy": 1.989688503742218, "epoch": 0.6105990783410138, "grad_norm": 4.787328243255615, "learning_rate": 1.664750687519945e-05, "loss": 0.0542, "mean_token_accuracy": 0.9866823434829712, "num_tokens": 3953946.0, "step": 1855 }, { "entropy": 2.018328094482422, "epoch": 0.6122448979591837, "grad_norm": 6.004918098449707, "learning_rate": 1.662715159506955e-05, "loss": 0.0683, "mean_token_accuracy": 0.9783054530620575, "num_tokens": 3964563.0, "step": 1860 }, { "entropy": 2.0147560954093935, "epoch": 0.6138907175773535, "grad_norm": 3.540149450302124, "learning_rate": 1.6606747233900816e-05, "loss": 0.0612, "mean_token_accuracy": 0.983444994688034, "num_tokens": 3974938.0, "step": 1865 }, { "entropy": 2.0170241355895997, "epoch": 0.6155365371955234, "grad_norm": 2.3520843982696533, "learning_rate": 1.6586293942809034e-05, "loss": 0.0675, "mean_token_accuracy": 0.980956619977951, "num_tokens": 3985380.0, "step": 1870 }, { "entropy": 2.022734725475311, "epoch": 0.6171823568136933, "grad_norm": 2.997664451599121, "learning_rate": 1.6565791873272373e-05, "loss": 0.0435, "mean_token_accuracy": 0.9847745478153229, "num_tokens": 3995690.0, "step": 1875 }, { "entropy": 2.0023676991462707, "epoch": 0.618828176431863, "grad_norm": 2.1239171028137207, "learning_rate": 1.6545241177130254e-05, "loss": 0.0486, "mean_token_accuracy": 0.9879401743412017, "num_tokens": 4006475.0, "step": 1880 }, { "entropy": 1.9986587166786194, "epoch": 0.620473996050033, "grad_norm": 3.1917483806610107, "learning_rate": 1.652464200658223e-05, "loss": 0.0688, "mean_token_accuracy": 0.9839587926864624, "num_tokens": 4017079.0, "step": 1885 }, { "entropy": 2.038959813117981, "epoch": 0.6221198156682027, "grad_norm": 2.038222551345825, "learning_rate": 1.650399451418686e-05, "loss": 0.0706, "mean_token_accuracy": 0.98506378531456, "num_tokens": 4027591.0, "step": 1890 }, { "entropy": 2.021359896659851, "epoch": 0.6237656352863726, "grad_norm": 3.2785909175872803, "learning_rate": 1.6483298852860584e-05, "loss": 0.063, "mean_token_accuracy": 0.9845045149326325, "num_tokens": 4038090.0, "step": 1895 }, { "entropy": 2.045257091522217, "epoch": 0.6254114549045424, "grad_norm": 1.1910496950149536, "learning_rate": 1.646255517587656e-05, "loss": 0.0417, "mean_token_accuracy": 0.9895485579967499, "num_tokens": 4048536.0, "step": 1900 }, { "entropy": 2.0391509175300597, "epoch": 0.6270572745227123, "grad_norm": 6.128629207611084, "learning_rate": 1.644176363686358e-05, "loss": 0.0824, "mean_token_accuracy": 0.9755302667617798, "num_tokens": 4059195.0, "step": 1905 }, { "entropy": 2.0614427804946898, "epoch": 0.6287030941408822, "grad_norm": 2.5974950790405273, "learning_rate": 1.6420924389804887e-05, "loss": 0.0469, "mean_token_accuracy": 0.989526915550232, "num_tokens": 4069610.0, "step": 1910 }, { "entropy": 2.0109457969665527, "epoch": 0.630348913759052, "grad_norm": 4.8466362953186035, "learning_rate": 1.6400037589037062e-05, "loss": 0.0814, "mean_token_accuracy": 0.9819941699504853, "num_tokens": 4080106.0, "step": 1915 }, { "entropy": 2.0402795314788817, "epoch": 0.6319947333772219, "grad_norm": 3.8476603031158447, "learning_rate": 1.6379103389248867e-05, "loss": 0.0909, "mean_token_accuracy": 0.9787125289440155, "num_tokens": 4090739.0, "step": 1920 }, { "entropy": 2.1076374292373656, "epoch": 0.6336405529953917, "grad_norm": 3.3376083374023438, "learning_rate": 1.63581219454801e-05, "loss": 0.0598, "mean_token_accuracy": 0.98504838347435, "num_tokens": 4101214.0, "step": 1925 }, { "entropy": 2.0517534494400023, "epoch": 0.6352863726135616, "grad_norm": 6.078008651733398, "learning_rate": 1.6337093413120463e-05, "loss": 0.0745, "mean_token_accuracy": 0.9847012758255005, "num_tokens": 4111702.0, "step": 1930 }, { "entropy": 2.0724000096321107, "epoch": 0.6369321922317314, "grad_norm": 1.9413197040557861, "learning_rate": 1.631601794790838e-05, "loss": 0.0533, "mean_token_accuracy": 0.9855819940567017, "num_tokens": 4122503.0, "step": 1935 }, { "entropy": 2.0283157587051392, "epoch": 0.6385780118499013, "grad_norm": 3.654207468032837, "learning_rate": 1.629489570592988e-05, "loss": 0.0704, "mean_token_accuracy": 0.9817177176475524, "num_tokens": 4132981.0, "step": 1940 }, { "entropy": 1.9633773922920228, "epoch": 0.640223831468071, "grad_norm": 5.509681701660156, "learning_rate": 1.6273726843617413e-05, "loss": 0.0735, "mean_token_accuracy": 0.9831919133663177, "num_tokens": 4143616.0, "step": 1945 }, { "entropy": 1.8812169671058654, "epoch": 0.6418696510862409, "grad_norm": 5.453320503234863, "learning_rate": 1.625251151774871e-05, "loss": 0.0743, "mean_token_accuracy": 0.9811854064464569, "num_tokens": 4154118.0, "step": 1950 }, { "entropy": 1.8234861969947815, "epoch": 0.6435154707044108, "grad_norm": 1.3750858306884766, "learning_rate": 1.62312498854456e-05, "loss": 0.0351, "mean_token_accuracy": 0.987557852268219, "num_tokens": 4165209.0, "step": 1955 }, { "entropy": 1.8196370124816894, "epoch": 0.6451612903225806, "grad_norm": 3.4758753776550293, "learning_rate": 1.620994210417287e-05, "loss": 0.0615, "mean_token_accuracy": 0.9843147337436676, "num_tokens": 4175834.0, "step": 1960 }, { "entropy": 1.8451374650001526, "epoch": 0.6468071099407505, "grad_norm": 0.8775233626365662, "learning_rate": 1.6188588331737086e-05, "loss": 0.1016, "mean_token_accuracy": 0.9828826725482941, "num_tokens": 4186493.0, "step": 1965 }, { "entropy": 1.8466584086418152, "epoch": 0.6484529295589203, "grad_norm": 2.0667057037353516, "learning_rate": 1.6167188726285433e-05, "loss": 0.041, "mean_token_accuracy": 0.985245656967163, "num_tokens": 4196992.0, "step": 1970 }, { "entropy": 1.820866048336029, "epoch": 0.6500987491770902, "grad_norm": 1.9866774082183838, "learning_rate": 1.6145743446304524e-05, "loss": 0.0521, "mean_token_accuracy": 0.9827068746089935, "num_tokens": 4207748.0, "step": 1975 }, { "entropy": 1.7237545490264892, "epoch": 0.65174456879526, "grad_norm": 4.054563045501709, "learning_rate": 1.6124252650619257e-05, "loss": 0.0721, "mean_token_accuracy": 0.9837478876113892, "num_tokens": 4218653.0, "step": 1980 }, { "entropy": 1.7705234289169312, "epoch": 0.6533903884134299, "grad_norm": 2.3912136554718018, "learning_rate": 1.610271649839161e-05, "loss": 0.0548, "mean_token_accuracy": 0.9835083067417145, "num_tokens": 4229354.0, "step": 1985 }, { "entropy": 1.733774983882904, "epoch": 0.6550362080315998, "grad_norm": 7.83612060546875, "learning_rate": 1.608113514911948e-05, "loss": 0.0886, "mean_token_accuracy": 0.9790404379367829, "num_tokens": 4240302.0, "step": 1990 }, { "entropy": 1.6919760227203369, "epoch": 0.6566820276497696, "grad_norm": 4.122557163238525, "learning_rate": 1.6059508762635482e-05, "loss": 0.0658, "mean_token_accuracy": 0.9874409794807434, "num_tokens": 4250965.0, "step": 1995 }, { "entropy": 1.727248990535736, "epoch": 0.6583278472679395, "grad_norm": 2.4015748500823975, "learning_rate": 1.6037837499105804e-05, "loss": 0.0619, "mean_token_accuracy": 0.9821118116378784, "num_tokens": 4261750.0, "step": 2000 }, { "entropy": 1.7900878310203552, "epoch": 0.6599736668861093, "grad_norm": 3.262315034866333, "learning_rate": 1.601612151902897e-05, "loss": 0.053, "mean_token_accuracy": 0.9853106379508972, "num_tokens": 4272360.0, "step": 2005 }, { "entropy": 1.8539373755455018, "epoch": 0.6616194865042792, "grad_norm": 3.7115604877471924, "learning_rate": 1.5994360983234698e-05, "loss": 0.0681, "mean_token_accuracy": 0.9834797859191895, "num_tokens": 4283012.0, "step": 2010 }, { "entropy": 1.888733983039856, "epoch": 0.6632653061224489, "grad_norm": 1.2049510478973389, "learning_rate": 1.5972556052882672e-05, "loss": 0.0399, "mean_token_accuracy": 0.9876511216163635, "num_tokens": 4293353.0, "step": 2015 }, { "entropy": 1.8981561660766602, "epoch": 0.6649111257406188, "grad_norm": 4.529847621917725, "learning_rate": 1.595070688946138e-05, "loss": 0.0757, "mean_token_accuracy": 0.9844479858875275, "num_tokens": 4303907.0, "step": 2020 }, { "entropy": 1.9379532098770142, "epoch": 0.6665569453587886, "grad_norm": 1.7552553415298462, "learning_rate": 1.592881365478688e-05, "loss": 0.0662, "mean_token_accuracy": 0.983349347114563, "num_tokens": 4314505.0, "step": 2025 }, { "entropy": 1.9738388657569885, "epoch": 0.6682027649769585, "grad_norm": 2.010423183441162, "learning_rate": 1.590687651100165e-05, "loss": 0.0707, "mean_token_accuracy": 0.9822420716285706, "num_tokens": 4325178.0, "step": 2030 }, { "entropy": 1.9870564699172975, "epoch": 0.6698485845951284, "grad_norm": 3.9561121463775635, "learning_rate": 1.5884895620573346e-05, "loss": 0.0568, "mean_token_accuracy": 0.9854467332363128, "num_tokens": 4335932.0, "step": 2035 }, { "entropy": 1.989486038684845, "epoch": 0.6714944042132982, "grad_norm": 2.525803565979004, "learning_rate": 1.5862871146293616e-05, "loss": 0.0593, "mean_token_accuracy": 0.9792122423648835, "num_tokens": 4346565.0, "step": 2040 }, { "entropy": 1.9590755701065063, "epoch": 0.6731402238314681, "grad_norm": 3.8919122219085693, "learning_rate": 1.5840803251276892e-05, "loss": 0.0805, "mean_token_accuracy": 0.9800050914287567, "num_tokens": 4357420.0, "step": 2045 }, { "entropy": 2.030180037021637, "epoch": 0.6747860434496379, "grad_norm": 3.5018019676208496, "learning_rate": 1.5818692098959187e-05, "loss": 0.099, "mean_token_accuracy": 0.9742182731628418, "num_tokens": 4368006.0, "step": 2050 }, { "entropy": 2.0822962045669557, "epoch": 0.6764318630678078, "grad_norm": 4.686304092407227, "learning_rate": 1.5796537853096875e-05, "loss": 0.0766, "mean_token_accuracy": 0.9759474039077759, "num_tokens": 4378686.0, "step": 2055 }, { "entropy": 2.134053909778595, "epoch": 0.6780776826859776, "grad_norm": 4.483613014221191, "learning_rate": 1.5774340677765483e-05, "loss": 0.0743, "mean_token_accuracy": 0.9787837326526642, "num_tokens": 4389381.0, "step": 2060 }, { "entropy": 2.101311683654785, "epoch": 0.6797235023041475, "grad_norm": 4.380194187164307, "learning_rate": 1.575210073735848e-05, "loss": 0.0875, "mean_token_accuracy": 0.9795165956020355, "num_tokens": 4400273.0, "step": 2065 }, { "entropy": 2.1265645384788514, "epoch": 0.6813693219223174, "grad_norm": 5.121950626373291, "learning_rate": 1.572981819658605e-05, "loss": 0.0539, "mean_token_accuracy": 0.9842699348926545, "num_tokens": 4411134.0, "step": 2070 }, { "entropy": 2.063078737258911, "epoch": 0.6830151415404871, "grad_norm": 1.915604829788208, "learning_rate": 1.5707493220473886e-05, "loss": 0.039, "mean_token_accuracy": 0.9892246127128601, "num_tokens": 4421711.0, "step": 2075 }, { "entropy": 2.0187936067581176, "epoch": 0.684660961158657, "grad_norm": 6.068004131317139, "learning_rate": 1.568512597436195e-05, "loss": 0.0724, "mean_token_accuracy": 0.9800882399082184, "num_tokens": 4432281.0, "step": 2080 }, { "entropy": 2.019049596786499, "epoch": 0.6863067807768268, "grad_norm": 3.6800379753112793, "learning_rate": 1.566271662390326e-05, "loss": 0.0666, "mean_token_accuracy": 0.9861168086528778, "num_tokens": 4442927.0, "step": 2085 }, { "entropy": 2.0945070028305053, "epoch": 0.6879526003949967, "grad_norm": 5.768988132476807, "learning_rate": 1.564026533506267e-05, "loss": 0.0637, "mean_token_accuracy": 0.9825628995895386, "num_tokens": 4453272.0, "step": 2090 }, { "entropy": 2.1388468503952027, "epoch": 0.6895984200131665, "grad_norm": 3.217071294784546, "learning_rate": 1.5617772274115618e-05, "loss": 0.0478, "mean_token_accuracy": 0.9849779725074768, "num_tokens": 4464006.0, "step": 2095 }, { "entropy": 2.1074278712272645, "epoch": 0.6912442396313364, "grad_norm": 2.122255563735962, "learning_rate": 1.559523760764692e-05, "loss": 0.1015, "mean_token_accuracy": 0.9739717364311218, "num_tokens": 4475018.0, "step": 2100 }, { "entropy": 2.1118789672851563, "epoch": 0.6928900592495063, "grad_norm": 2.033972978591919, "learning_rate": 1.5572661502549514e-05, "loss": 0.0485, "mean_token_accuracy": 0.9862671077251435, "num_tokens": 4485676.0, "step": 2105 }, { "entropy": 2.1021223068237305, "epoch": 0.6945358788676761, "grad_norm": 0.6050166487693787, "learning_rate": 1.5550044126023245e-05, "loss": 0.0395, "mean_token_accuracy": 0.9876242458820343, "num_tokens": 4496186.0, "step": 2110 }, { "entropy": 1.970346164703369, "epoch": 0.696181698485846, "grad_norm": 3.1835389137268066, "learning_rate": 1.5527385645573613e-05, "loss": 0.0538, "mean_token_accuracy": 0.9837459921836853, "num_tokens": 4507128.0, "step": 2115 }, { "entropy": 1.9633802771568298, "epoch": 0.6978275181040158, "grad_norm": 0.6275081634521484, "learning_rate": 1.5504686229010535e-05, "loss": 0.0388, "mean_token_accuracy": 0.987060683965683, "num_tokens": 4517561.0, "step": 2120 }, { "entropy": 1.889497458934784, "epoch": 0.6994733377221857, "grad_norm": 8.642451286315918, "learning_rate": 1.54819460444471e-05, "loss": 0.0617, "mean_token_accuracy": 0.9837220013141632, "num_tokens": 4528490.0, "step": 2125 }, { "entropy": 1.8756322503089904, "epoch": 0.7011191573403555, "grad_norm": 1.7640129327774048, "learning_rate": 1.545916526029833e-05, "loss": 0.0606, "mean_token_accuracy": 0.9831870555877685, "num_tokens": 4539249.0, "step": 2130 }, { "entropy": 1.84752117395401, "epoch": 0.7027649769585254, "grad_norm": 1.39552903175354, "learning_rate": 1.5436344045279935e-05, "loss": 0.079, "mean_token_accuracy": 0.9791609466075897, "num_tokens": 4550292.0, "step": 2135 }, { "entropy": 1.8886283040046692, "epoch": 0.7044107965766951, "grad_norm": 1.880411148071289, "learning_rate": 1.5413482568407044e-05, "loss": 0.0807, "mean_token_accuracy": 0.9817405939102173, "num_tokens": 4560802.0, "step": 2140 }, { "entropy": 1.8611098527908325, "epoch": 0.706056616194865, "grad_norm": 2.7298200130462646, "learning_rate": 1.539058099899299e-05, "loss": 0.0653, "mean_token_accuracy": 0.9814600467681884, "num_tokens": 4571690.0, "step": 2145 }, { "entropy": 1.791181170940399, "epoch": 0.7077024358130349, "grad_norm": 5.456151485443115, "learning_rate": 1.5367639506648006e-05, "loss": 0.0708, "mean_token_accuracy": 0.9785441160202026, "num_tokens": 4582204.0, "step": 2150 }, { "entropy": 1.752633535861969, "epoch": 0.7093482554312047, "grad_norm": 4.673461437225342, "learning_rate": 1.5344658261278013e-05, "loss": 0.0629, "mean_token_accuracy": 0.982696121931076, "num_tokens": 4592993.0, "step": 2155 }, { "entropy": 1.810698115825653, "epoch": 0.7109940750493746, "grad_norm": 4.658589839935303, "learning_rate": 1.532163743308335e-05, "loss": 0.0453, "mean_token_accuracy": 0.990430223941803, "num_tokens": 4603443.0, "step": 2160 }, { "entropy": 1.7778592824935913, "epoch": 0.7126398946675444, "grad_norm": 3.202155828475952, "learning_rate": 1.5298577192557487e-05, "loss": 0.069, "mean_token_accuracy": 0.98820481300354, "num_tokens": 4614058.0, "step": 2165 }, { "entropy": 1.7779869794845582, "epoch": 0.7142857142857143, "grad_norm": 1.4483567476272583, "learning_rate": 1.5275477710485812e-05, "loss": 0.0808, "mean_token_accuracy": 0.9856044292449951, "num_tokens": 4625236.0, "step": 2170 }, { "entropy": 1.900163722038269, "epoch": 0.7159315339038841, "grad_norm": 1.0799473524093628, "learning_rate": 1.525233915794432e-05, "loss": 0.0379, "mean_token_accuracy": 0.9869786322116851, "num_tokens": 4635887.0, "step": 2175 }, { "entropy": 1.8798472046852113, "epoch": 0.717577353522054, "grad_norm": 4.686890602111816, "learning_rate": 1.522916170629836e-05, "loss": 0.0359, "mean_token_accuracy": 0.9911372125148773, "num_tokens": 4646429.0, "step": 2180 }, { "entropy": 1.9202223777770997, "epoch": 0.7192231731402239, "grad_norm": 3.380596399307251, "learning_rate": 1.5205945527201386e-05, "loss": 0.0877, "mean_token_accuracy": 0.9812184333801269, "num_tokens": 4656951.0, "step": 2185 }, { "entropy": 1.916778802871704, "epoch": 0.7208689927583937, "grad_norm": 2.9656357765197754, "learning_rate": 1.5182690792593659e-05, "loss": 0.0866, "mean_token_accuracy": 0.9801094174385071, "num_tokens": 4667789.0, "step": 2190 }, { "entropy": 1.9705326914787293, "epoch": 0.7225148123765636, "grad_norm": 1.6277377605438232, "learning_rate": 1.515939767470098e-05, "loss": 0.0684, "mean_token_accuracy": 0.9821272671222687, "num_tokens": 4678462.0, "step": 2195 }, { "entropy": 2.0496770977973937, "epoch": 0.7241606319947334, "grad_norm": 2.6740543842315674, "learning_rate": 1.5136066346033431e-05, "loss": 0.0827, "mean_token_accuracy": 0.9822946667671204, "num_tokens": 4689142.0, "step": 2200 }, { "entropy": 1.99622882604599, "epoch": 0.7258064516129032, "grad_norm": 2.084319829940796, "learning_rate": 1.5112696979384076e-05, "loss": 0.0622, "mean_token_accuracy": 0.9843969225883484, "num_tokens": 4699742.0, "step": 2205 }, { "entropy": 1.9976887702941895, "epoch": 0.727452271231073, "grad_norm": 4.397096633911133, "learning_rate": 1.5089289747827698e-05, "loss": 0.0672, "mean_token_accuracy": 0.9822742283344269, "num_tokens": 4710529.0, "step": 2210 }, { "entropy": 2.00170783996582, "epoch": 0.7290980908492429, "grad_norm": 1.066505789756775, "learning_rate": 1.5065844824719498e-05, "loss": 0.0403, "mean_token_accuracy": 0.9920450508594513, "num_tokens": 4721625.0, "step": 2215 }, { "entropy": 2.0141910314559937, "epoch": 0.7307439104674127, "grad_norm": 3.045076370239258, "learning_rate": 1.504236238369383e-05, "loss": 0.0529, "mean_token_accuracy": 0.9883207738399505, "num_tokens": 4732195.0, "step": 2220 }, { "entropy": 2.031786823272705, "epoch": 0.7323897300855826, "grad_norm": 5.866887092590332, "learning_rate": 1.5018842598662913e-05, "loss": 0.08, "mean_token_accuracy": 0.9810294449329376, "num_tokens": 4742750.0, "step": 2225 }, { "entropy": 2.0390859127044676, "epoch": 0.7340355497037525, "grad_norm": 3.7776222229003906, "learning_rate": 1.499528564381553e-05, "loss": 0.0844, "mean_token_accuracy": 0.9762147605419159, "num_tokens": 4753095.0, "step": 2230 }, { "entropy": 2.058468997478485, "epoch": 0.7356813693219223, "grad_norm": 2.748897075653076, "learning_rate": 1.497169169361574e-05, "loss": 0.0628, "mean_token_accuracy": 0.9859977424144745, "num_tokens": 4763609.0, "step": 2235 }, { "entropy": 1.9909031867980957, "epoch": 0.7373271889400922, "grad_norm": 1.4906212091445923, "learning_rate": 1.4948060922801609e-05, "loss": 0.0506, "mean_token_accuracy": 0.9845727860927582, "num_tokens": 4774709.0, "step": 2240 }, { "entropy": 1.9794199824333192, "epoch": 0.738973008558262, "grad_norm": 5.407120227813721, "learning_rate": 1.492439350638388e-05, "loss": 0.0638, "mean_token_accuracy": 0.9835636436939239, "num_tokens": 4785745.0, "step": 2245 }, { "entropy": 2.052793502807617, "epoch": 0.7406188281764319, "grad_norm": 0.8577380776405334, "learning_rate": 1.4900689619644709e-05, "loss": 0.0635, "mean_token_accuracy": 0.9855947732925415, "num_tokens": 4796471.0, "step": 2250 }, { "entropy": 2.1061079859733582, "epoch": 0.7422646477946017, "grad_norm": 3.701122283935547, "learning_rate": 1.4876949438136348e-05, "loss": 0.0884, "mean_token_accuracy": 0.9817906200885773, "num_tokens": 4807169.0, "step": 2255 }, { "entropy": 2.124563980102539, "epoch": 0.7439104674127716, "grad_norm": 0.8693974018096924, "learning_rate": 1.4853173137679845e-05, "loss": 0.0565, "mean_token_accuracy": 0.9871229946613311, "num_tokens": 4818068.0, "step": 2260 }, { "entropy": 2.1583971261978148, "epoch": 0.7455562870309415, "grad_norm": 3.7275748252868652, "learning_rate": 1.4829360894363755e-05, "loss": 0.0685, "mean_token_accuracy": 0.981871497631073, "num_tokens": 4829134.0, "step": 2265 }, { "entropy": 2.1540443658828736, "epoch": 0.7472021066491112, "grad_norm": 5.0215373039245605, "learning_rate": 1.4805512884542828e-05, "loss": 0.1157, "mean_token_accuracy": 0.9756322801113129, "num_tokens": 4839612.0, "step": 2270 }, { "entropy": 2.0997665166854858, "epoch": 0.7488479262672811, "grad_norm": 1.8622218370437622, "learning_rate": 1.4781629284836689e-05, "loss": 0.076, "mean_token_accuracy": 0.9799229741096497, "num_tokens": 4850560.0, "step": 2275 }, { "entropy": 2.082018828392029, "epoch": 0.7504937458854509, "grad_norm": 3.323389768600464, "learning_rate": 1.4757710272128562e-05, "loss": 0.0679, "mean_token_accuracy": 0.9854858696460724, "num_tokens": 4861281.0, "step": 2280 }, { "entropy": 2.145115876197815, "epoch": 0.7521395655036208, "grad_norm": 3.6958255767822266, "learning_rate": 1.4733756023563932e-05, "loss": 0.0801, "mean_token_accuracy": 0.9791579902172088, "num_tokens": 4872072.0, "step": 2285 }, { "entropy": 2.2006231546401978, "epoch": 0.7537853851217906, "grad_norm": 2.5469164848327637, "learning_rate": 1.4709766716549246e-05, "loss": 0.0564, "mean_token_accuracy": 0.9853972256183624, "num_tokens": 4882541.0, "step": 2290 }, { "entropy": 2.155518102645874, "epoch": 0.7554312047399605, "grad_norm": 3.5733323097229004, "learning_rate": 1.4685742528750584e-05, "loss": 0.0609, "mean_token_accuracy": 0.9829520225524903, "num_tokens": 4893417.0, "step": 2295 }, { "entropy": 2.142457294464111, "epoch": 0.7570770243581304, "grad_norm": 3.7663276195526123, "learning_rate": 1.4661683638092375e-05, "loss": 0.0666, "mean_token_accuracy": 0.9813594579696655, "num_tokens": 4904063.0, "step": 2300 }, { "entropy": 2.1797530174255373, "epoch": 0.7587228439763002, "grad_norm": 3.873552083969116, "learning_rate": 1.4637590222756041e-05, "loss": 0.0482, "mean_token_accuracy": 0.9882281720638275, "num_tokens": 4914583.0, "step": 2305 }, { "entropy": 2.1009862065315246, "epoch": 0.7603686635944701, "grad_norm": 2.9890084266662598, "learning_rate": 1.4613462461178714e-05, "loss": 0.0542, "mean_token_accuracy": 0.9857582569122314, "num_tokens": 4925125.0, "step": 2310 }, { "entropy": 2.1062487721443177, "epoch": 0.7620144832126399, "grad_norm": 2.7217583656311035, "learning_rate": 1.4589300532051875e-05, "loss": 0.0649, "mean_token_accuracy": 0.987279736995697, "num_tokens": 4935529.0, "step": 2315 }, { "entropy": 2.0136227369308473, "epoch": 0.7636603028308098, "grad_norm": 6.62637996673584, "learning_rate": 1.4565104614320065e-05, "loss": 0.0809, "mean_token_accuracy": 0.9829951286315918, "num_tokens": 4946511.0, "step": 2320 }, { "entropy": 2.0690441846847536, "epoch": 0.7653061224489796, "grad_norm": 4.281233787536621, "learning_rate": 1.4540874887179546e-05, "loss": 0.0676, "mean_token_accuracy": 0.9836612105369568, "num_tokens": 4957109.0, "step": 2325 }, { "entropy": 2.0863526821136475, "epoch": 0.7669519420671495, "grad_norm": 2.6421709060668945, "learning_rate": 1.451661153007697e-05, "loss": 0.0828, "mean_token_accuracy": 0.9836499035358429, "num_tokens": 4967988.0, "step": 2330 }, { "entropy": 2.0738728761672975, "epoch": 0.7685977616853192, "grad_norm": 1.4690742492675781, "learning_rate": 1.4492314722708057e-05, "loss": 0.0488, "mean_token_accuracy": 0.9871618390083313, "num_tokens": 4978584.0, "step": 2335 }, { "entropy": 2.100434124469757, "epoch": 0.7702435813034891, "grad_norm": 3.107667922973633, "learning_rate": 1.4467984645016259e-05, "loss": 0.0678, "mean_token_accuracy": 0.9847745180130005, "num_tokens": 4989176.0, "step": 2340 }, { "entropy": 2.0772449016571044, "epoch": 0.771889400921659, "grad_norm": 2.9565625190734863, "learning_rate": 1.4443621477191434e-05, "loss": 0.0564, "mean_token_accuracy": 0.9867813110351562, "num_tokens": 4999990.0, "step": 2345 }, { "entropy": 2.0808992266654966, "epoch": 0.7735352205398288, "grad_norm": 4.1320390701293945, "learning_rate": 1.4419225399668504e-05, "loss": 0.0641, "mean_token_accuracy": 0.982687771320343, "num_tokens": 5010626.0, "step": 2350 }, { "entropy": 2.0873427987098694, "epoch": 0.7751810401579987, "grad_norm": 7.7029595375061035, "learning_rate": 1.4394796593126121e-05, "loss": 0.0628, "mean_token_accuracy": 0.9848801851272583, "num_tokens": 5021336.0, "step": 2355 }, { "entropy": 2.007732594013214, "epoch": 0.7768268597761685, "grad_norm": 1.4012236595153809, "learning_rate": 1.4370335238485336e-05, "loss": 0.042, "mean_token_accuracy": 0.9886705160140992, "num_tokens": 5032297.0, "step": 2360 }, { "entropy": 2.067774224281311, "epoch": 0.7784726793943384, "grad_norm": 3.532789945602417, "learning_rate": 1.4345841516908244e-05, "loss": 0.093, "mean_token_accuracy": 0.9788289248943329, "num_tokens": 5042834.0, "step": 2365 }, { "entropy": 2.1315778255462647, "epoch": 0.7801184990125082, "grad_norm": 3.001561403274536, "learning_rate": 1.4321315609796656e-05, "loss": 0.0596, "mean_token_accuracy": 0.985314530134201, "num_tokens": 5053499.0, "step": 2370 }, { "entropy": 2.088835906982422, "epoch": 0.7817643186306781, "grad_norm": 1.8496758937835693, "learning_rate": 1.4296757698790752e-05, "loss": 0.0544, "mean_token_accuracy": 0.9835005939006806, "num_tokens": 5064232.0, "step": 2375 }, { "entropy": 2.1136799812316895, "epoch": 0.783410138248848, "grad_norm": 3.9325993061065674, "learning_rate": 1.4272167965767735e-05, "loss": 0.0599, "mean_token_accuracy": 0.9857981920242309, "num_tokens": 5074681.0, "step": 2380 }, { "entropy": 2.082188093662262, "epoch": 0.7850559578670178, "grad_norm": 2.6712751388549805, "learning_rate": 1.424754659284048e-05, "loss": 0.0425, "mean_token_accuracy": 0.9875874161720276, "num_tokens": 5085364.0, "step": 2385 }, { "entropy": 2.0782926797866823, "epoch": 0.7867017774851877, "grad_norm": 2.5082037448883057, "learning_rate": 1.422289376235619e-05, "loss": 0.0924, "mean_token_accuracy": 0.9773596286773681, "num_tokens": 5095986.0, "step": 2390 }, { "entropy": 2.1379014015197755, "epoch": 0.7883475971033574, "grad_norm": 3.3415675163269043, "learning_rate": 1.4198209656895045e-05, "loss": 0.0538, "mean_token_accuracy": 0.9802229166030884, "num_tokens": 5106549.0, "step": 2395 }, { "entropy": 2.1798630714416505, "epoch": 0.7899934167215273, "grad_norm": 2.5034339427948, "learning_rate": 1.4173494459268848e-05, "loss": 0.0529, "mean_token_accuracy": 0.9868557870388031, "num_tokens": 5117206.0, "step": 2400 }, { "entropy": 2.1523491382598876, "epoch": 0.7916392363396971, "grad_norm": 2.866143226623535, "learning_rate": 1.4148748352519677e-05, "loss": 0.0434, "mean_token_accuracy": 0.9868989825248718, "num_tokens": 5127670.0, "step": 2405 }, { "entropy": 2.1185644268989563, "epoch": 0.793285055957867, "grad_norm": 3.293668270111084, "learning_rate": 1.4123971519918516e-05, "loss": 0.0704, "mean_token_accuracy": 0.9803002297878265, "num_tokens": 5138300.0, "step": 2410 }, { "entropy": 2.1225445747375487, "epoch": 0.7949308755760369, "grad_norm": 3.2386057376861572, "learning_rate": 1.4099164144963914e-05, "loss": 0.0745, "mean_token_accuracy": 0.976332038640976, "num_tokens": 5148898.0, "step": 2415 }, { "entropy": 2.0981351375579833, "epoch": 0.7965766951942067, "grad_norm": 2.420224666595459, "learning_rate": 1.4074326411380617e-05, "loss": 0.0554, "mean_token_accuracy": 0.9834634006023407, "num_tokens": 5159779.0, "step": 2420 }, { "entropy": 2.102435576915741, "epoch": 0.7982225148123766, "grad_norm": 4.499454975128174, "learning_rate": 1.4049458503118206e-05, "loss": 0.0728, "mean_token_accuracy": 0.9828149318695069, "num_tokens": 5170439.0, "step": 2425 }, { "entropy": 2.02862309217453, "epoch": 0.7998683344305464, "grad_norm": 1.3120136260986328, "learning_rate": 1.4024560604349738e-05, "loss": 0.07, "mean_token_accuracy": 0.9814626455307007, "num_tokens": 5181355.0, "step": 2430 }, { "epoch": 0.8005266622778143, "eval_entropy": 2.0658205878150566, "eval_loss": 0.05905340239405632, "eval_mean_token_accuracy": 0.9841244661327054, "eval_num_tokens": 5185715.0, "eval_runtime": 197.2045, "eval_samples_per_second": 42.225, "eval_steps_per_second": 7.038, "step": 2432 }, { "entropy": 1.9992692589759826, "epoch": 0.8015141540487163, "grad_norm": 2.716399669647217, "learning_rate": 1.3999632899470377e-05, "loss": 0.0415, "mean_token_accuracy": 0.9883556425571441, "num_tokens": 5192402.0, "step": 2435 }, { "entropy": 2.073114001750946, "epoch": 0.8031599736668861, "grad_norm": 4.84559440612793, "learning_rate": 1.3974675573096046e-05, "loss": 0.0516, "mean_token_accuracy": 0.9831765830516815, "num_tokens": 5203184.0, "step": 2440 }, { "entropy": 2.0994094610214233, "epoch": 0.804805793285056, "grad_norm": 3.8086822032928467, "learning_rate": 1.3949688810062033e-05, "loss": 0.0714, "mean_token_accuracy": 0.9829389989376068, "num_tokens": 5213537.0, "step": 2445 }, { "entropy": 2.1006295680999756, "epoch": 0.8064516129032258, "grad_norm": 4.448069095611572, "learning_rate": 1.3924672795421638e-05, "loss": 0.0567, "mean_token_accuracy": 0.9861303806304932, "num_tokens": 5224028.0, "step": 2450 }, { "entropy": 2.0546552419662474, "epoch": 0.8080974325213957, "grad_norm": 2.4029576778411865, "learning_rate": 1.389962771444481e-05, "loss": 0.0561, "mean_token_accuracy": 0.986747932434082, "num_tokens": 5234600.0, "step": 2455 }, { "entropy": 2.0826905846595762, "epoch": 0.8097432521395656, "grad_norm": 3.8037831783294678, "learning_rate": 1.3874553752616747e-05, "loss": 0.0647, "mean_token_accuracy": 0.9837893486022949, "num_tokens": 5245430.0, "step": 2460 }, { "entropy": 2.101279282569885, "epoch": 0.8113890717577353, "grad_norm": 4.736576080322266, "learning_rate": 1.3849451095636555e-05, "loss": 0.0393, "mean_token_accuracy": 0.9862057268619537, "num_tokens": 5256031.0, "step": 2465 }, { "entropy": 2.0845310568809508, "epoch": 0.8130348913759052, "grad_norm": 3.912712812423706, "learning_rate": 1.3824319929415856e-05, "loss": 0.098, "mean_token_accuracy": 0.9780865371227264, "num_tokens": 5266625.0, "step": 2470 }, { "entropy": 1.9965445041656493, "epoch": 0.814680710994075, "grad_norm": 2.1379354000091553, "learning_rate": 1.3799160440077407e-05, "loss": 0.0305, "mean_token_accuracy": 0.9910949647426606, "num_tokens": 5277179.0, "step": 2475 }, { "entropy": 1.9245966672897339, "epoch": 0.8163265306122449, "grad_norm": 3.4994077682495117, "learning_rate": 1.3773972813953726e-05, "loss": 0.0606, "mean_token_accuracy": 0.9876502931118012, "num_tokens": 5287715.0, "step": 2480 }, { "entropy": 1.872160291671753, "epoch": 0.8179723502304147, "grad_norm": 4.733576774597168, "learning_rate": 1.3748757237585729e-05, "loss": 0.0691, "mean_token_accuracy": 0.9822778880596161, "num_tokens": 5298582.0, "step": 2485 }, { "entropy": 1.8784523367881776, "epoch": 0.8196181698485846, "grad_norm": 3.7372028827667236, "learning_rate": 1.372351389772131e-05, "loss": 0.0713, "mean_token_accuracy": 0.9856967926025391, "num_tokens": 5309140.0, "step": 2490 }, { "entropy": 1.9651299357414245, "epoch": 0.8212639894667545, "grad_norm": 3.022817611694336, "learning_rate": 1.3698242981314e-05, "loss": 0.0547, "mean_token_accuracy": 0.9867731034755707, "num_tokens": 5319522.0, "step": 2495 }, { "entropy": 1.943724513053894, "epoch": 0.8229098090849243, "grad_norm": 3.299440622329712, "learning_rate": 1.3672944675521555e-05, "loss": 0.0434, "mean_token_accuracy": 0.985489410161972, "num_tokens": 5330359.0, "step": 2500 }, { "entropy": 1.9148884296417237, "epoch": 0.8245556287030942, "grad_norm": 2.022271156311035, "learning_rate": 1.3647619167704578e-05, "loss": 0.0457, "mean_token_accuracy": 0.9900146067142487, "num_tokens": 5341225.0, "step": 2505 }, { "entropy": 1.8972573161125184, "epoch": 0.826201448321264, "grad_norm": 1.6173745393753052, "learning_rate": 1.3622266645425135e-05, "loss": 0.0371, "mean_token_accuracy": 0.9884720623493195, "num_tokens": 5351753.0, "step": 2510 }, { "entropy": 1.845130443572998, "epoch": 0.8278472679394339, "grad_norm": 3.775874376296997, "learning_rate": 1.359688729644536e-05, "loss": 0.0575, "mean_token_accuracy": 0.985155212879181, "num_tokens": 5362524.0, "step": 2515 }, { "entropy": 1.8224045038223267, "epoch": 0.8294930875576036, "grad_norm": 2.454981803894043, "learning_rate": 1.3571481308726064e-05, "loss": 0.0544, "mean_token_accuracy": 0.9880367636680603, "num_tokens": 5373427.0, "step": 2520 }, { "entropy": 1.854708468914032, "epoch": 0.8311389071757735, "grad_norm": 4.371317386627197, "learning_rate": 1.3546048870425356e-05, "loss": 0.0705, "mean_token_accuracy": 0.9838429808616638, "num_tokens": 5383996.0, "step": 2525 }, { "entropy": 1.8091017842292785, "epoch": 0.8327847267939433, "grad_norm": 2.3962645530700684, "learning_rate": 1.3520590169897232e-05, "loss": 0.0525, "mean_token_accuracy": 0.9856068968772889, "num_tokens": 5394554.0, "step": 2530 }, { "entropy": 1.827188992500305, "epoch": 0.8344305464121132, "grad_norm": 3.678561210632324, "learning_rate": 1.3495105395690185e-05, "loss": 0.0538, "mean_token_accuracy": 0.9860463380813599, "num_tokens": 5405311.0, "step": 2535 }, { "entropy": 1.8938659191131593, "epoch": 0.8360763660302831, "grad_norm": 4.154452323913574, "learning_rate": 1.3469594736545816e-05, "loss": 0.0566, "mean_token_accuracy": 0.982678347826004, "num_tokens": 5415855.0, "step": 2540 }, { "entropy": 1.8611347198486328, "epoch": 0.8377221856484529, "grad_norm": 5.7794365882873535, "learning_rate": 1.344405838139743e-05, "loss": 0.06, "mean_token_accuracy": 0.9849008440971374, "num_tokens": 5426534.0, "step": 2545 }, { "entropy": 1.8572628855705262, "epoch": 0.8393680052666228, "grad_norm": 3.2509214878082275, "learning_rate": 1.341849651936864e-05, "loss": 0.0449, "mean_token_accuracy": 0.9872718393802643, "num_tokens": 5437422.0, "step": 2550 }, { "entropy": 1.9231997847557067, "epoch": 0.8410138248847926, "grad_norm": 1.5836360454559326, "learning_rate": 1.3392909339771957e-05, "loss": 0.0687, "mean_token_accuracy": 0.9836991250514984, "num_tokens": 5447835.0, "step": 2555 }, { "entropy": 2.006228494644165, "epoch": 0.8426596445029625, "grad_norm": 2.372229814529419, "learning_rate": 1.3367297032107404e-05, "loss": 0.0748, "mean_token_accuracy": 0.9789046466350555, "num_tokens": 5458430.0, "step": 2560 }, { "entropy": 2.013643407821655, "epoch": 0.8443054641211323, "grad_norm": 2.6997487545013428, "learning_rate": 1.33416597860611e-05, "loss": 0.0693, "mean_token_accuracy": 0.9819782853126526, "num_tokens": 5469093.0, "step": 2565 }, { "entropy": 2.086654710769653, "epoch": 0.8459512837393022, "grad_norm": 5.783359050750732, "learning_rate": 1.331599779150386e-05, "loss": 0.0602, "mean_token_accuracy": 0.9840903103351593, "num_tokens": 5479883.0, "step": 2570 }, { "entropy": 1.9687761425971986, "epoch": 0.8475971033574721, "grad_norm": 1.4621000289916992, "learning_rate": 1.3290311238489784e-05, "loss": 0.037, "mean_token_accuracy": 0.9909431576728821, "num_tokens": 5490442.0, "step": 2575 }, { "entropy": 1.9323675155639648, "epoch": 0.8492429229756419, "grad_norm": 3.7601616382598877, "learning_rate": 1.3264600317254854e-05, "loss": 0.0535, "mean_token_accuracy": 0.9863576173782349, "num_tokens": 5501192.0, "step": 2580 }, { "entropy": 1.993829894065857, "epoch": 0.8508887425938118, "grad_norm": 2.846726655960083, "learning_rate": 1.3238865218215535e-05, "loss": 0.0735, "mean_token_accuracy": 0.9818755805492401, "num_tokens": 5511779.0, "step": 2585 }, { "entropy": 1.9815747618675232, "epoch": 0.8525345622119815, "grad_norm": 2.043933868408203, "learning_rate": 1.3213106131967339e-05, "loss": 0.0394, "mean_token_accuracy": 0.9880852580070496, "num_tokens": 5522208.0, "step": 2590 }, { "entropy": 1.9507970690727234, "epoch": 0.8541803818301514, "grad_norm": 3.3365821838378906, "learning_rate": 1.3187323249283439e-05, "loss": 0.0449, "mean_token_accuracy": 0.987623393535614, "num_tokens": 5532774.0, "step": 2595 }, { "entropy": 1.8794512629508973, "epoch": 0.8558262014483212, "grad_norm": 4.568225860595703, "learning_rate": 1.316151676111324e-05, "loss": 0.047, "mean_token_accuracy": 0.9890295684337616, "num_tokens": 5543614.0, "step": 2600 }, { "entropy": 1.8917754292488098, "epoch": 0.8574720210664911, "grad_norm": 1.8697177171707153, "learning_rate": 1.313568685858097e-05, "loss": 0.0604, "mean_token_accuracy": 0.9812738180160523, "num_tokens": 5554191.0, "step": 2605 }, { "entropy": 1.8788981676101684, "epoch": 0.859117840684661, "grad_norm": 2.4049696922302246, "learning_rate": 1.3109833732984272e-05, "loss": 0.0368, "mean_token_accuracy": 0.9879702389240265, "num_tokens": 5564836.0, "step": 2610 }, { "entropy": 1.8883351802825927, "epoch": 0.8607636603028308, "grad_norm": 3.053053379058838, "learning_rate": 1.3083957575792772e-05, "loss": 0.0608, "mean_token_accuracy": 0.9815441846847535, "num_tokens": 5575491.0, "step": 2615 }, { "entropy": 1.8524894714355469, "epoch": 0.8624094799210007, "grad_norm": 2.5347635746002197, "learning_rate": 1.3058058578646673e-05, "loss": 0.0676, "mean_token_accuracy": 0.9812525391578675, "num_tokens": 5586127.0, "step": 2620 }, { "entropy": 1.9090933799743652, "epoch": 0.8640552995391705, "grad_norm": 4.13297176361084, "learning_rate": 1.3032136933355336e-05, "loss": 0.0675, "mean_token_accuracy": 0.9843396723270417, "num_tokens": 5596745.0, "step": 2625 }, { "entropy": 1.8861599802970885, "epoch": 0.8657011191573404, "grad_norm": 3.5086593627929688, "learning_rate": 1.3006192831895846e-05, "loss": 0.0521, "mean_token_accuracy": 0.9839343369007111, "num_tokens": 5607882.0, "step": 2630 }, { "entropy": 1.9625224947929383, "epoch": 0.8673469387755102, "grad_norm": 2.0559775829315186, "learning_rate": 1.2980226466411605e-05, "loss": 0.06, "mean_token_accuracy": 0.9822891354560852, "num_tokens": 5618184.0, "step": 2635 }, { "entropy": 1.8167145729064942, "epoch": 0.8689927583936801, "grad_norm": 11.442062377929688, "learning_rate": 1.2954238029210906e-05, "loss": 0.0468, "mean_token_accuracy": 0.9872013688087463, "num_tokens": 5628846.0, "step": 2640 }, { "entropy": 1.7704446077346803, "epoch": 0.8706385780118499, "grad_norm": 3.554111957550049, "learning_rate": 1.2928227712765504e-05, "loss": 0.0784, "mean_token_accuracy": 0.9803166925907135, "num_tokens": 5639683.0, "step": 2645 }, { "entropy": 1.746169638633728, "epoch": 0.8722843976300197, "grad_norm": 4.623286247253418, "learning_rate": 1.290219570970919e-05, "loss": 0.0622, "mean_token_accuracy": 0.9823844611644745, "num_tokens": 5650210.0, "step": 2650 }, { "entropy": 1.671170949935913, "epoch": 0.8739302172481896, "grad_norm": 3.731656551361084, "learning_rate": 1.2876142212836373e-05, "loss": 0.0572, "mean_token_accuracy": 0.9831017911434173, "num_tokens": 5660768.0, "step": 2655 }, { "entropy": 1.7002404808998108, "epoch": 0.8755760368663594, "grad_norm": 3.3224618434906006, "learning_rate": 1.2850067415100643e-05, "loss": 0.0796, "mean_token_accuracy": 0.9770964682102203, "num_tokens": 5671563.0, "step": 2660 }, { "entropy": 1.7098248720169067, "epoch": 0.8772218564845293, "grad_norm": 4.906748294830322, "learning_rate": 1.2823971509613338e-05, "loss": 0.0776, "mean_token_accuracy": 0.9807847082614899, "num_tokens": 5682040.0, "step": 2665 }, { "entropy": 1.8245792269706727, "epoch": 0.8788676761026991, "grad_norm": 3.9868807792663574, "learning_rate": 1.2797854689642136e-05, "loss": 0.074, "mean_token_accuracy": 0.9827134788036347, "num_tokens": 5692607.0, "step": 2670 }, { "entropy": 1.8712535619735717, "epoch": 0.880513495720869, "grad_norm": 2.729797601699829, "learning_rate": 1.2771717148609598e-05, "loss": 0.0657, "mean_token_accuracy": 0.9844698071479797, "num_tokens": 5703254.0, "step": 2675 }, { "entropy": 1.8171938061714172, "epoch": 0.8821593153390388, "grad_norm": 2.3974361419677734, "learning_rate": 1.2745559080091749e-05, "loss": 0.0492, "mean_token_accuracy": 0.98358274102211, "num_tokens": 5713800.0, "step": 2680 }, { "entropy": 1.7879636526107787, "epoch": 0.8838051349572087, "grad_norm": 0.965469479560852, "learning_rate": 1.2719380677816648e-05, "loss": 0.0679, "mean_token_accuracy": 0.9823775053024292, "num_tokens": 5724150.0, "step": 2685 }, { "entropy": 1.752970778942108, "epoch": 0.8854509545753786, "grad_norm": 3.1124043464660645, "learning_rate": 1.2693182135662933e-05, "loss": 0.0431, "mean_token_accuracy": 0.9858216881752014, "num_tokens": 5734916.0, "step": 2690 }, { "entropy": 1.8105898737907409, "epoch": 0.8870967741935484, "grad_norm": 1.5042496919631958, "learning_rate": 1.2666963647658413e-05, "loss": 0.0514, "mean_token_accuracy": 0.9884253561496734, "num_tokens": 5745344.0, "step": 2695 }, { "entropy": 1.7806416988372802, "epoch": 0.8887425938117183, "grad_norm": 2.657801389694214, "learning_rate": 1.2640725407978607e-05, "loss": 0.0344, "mean_token_accuracy": 0.9910404562950135, "num_tokens": 5755871.0, "step": 2700 }, { "entropy": 1.7876773238182069, "epoch": 0.8903884134298881, "grad_norm": 2.7087864875793457, "learning_rate": 1.2614467610945323e-05, "loss": 0.0704, "mean_token_accuracy": 0.9819835543632507, "num_tokens": 5766351.0, "step": 2705 }, { "entropy": 1.7520806312561035, "epoch": 0.892034233048058, "grad_norm": 2.7425425052642822, "learning_rate": 1.2588190451025209e-05, "loss": 0.0391, "mean_token_accuracy": 0.9887621998786926, "num_tokens": 5776778.0, "step": 2710 }, { "entropy": 1.7028562068939208, "epoch": 0.8936800526662277, "grad_norm": 1.6712589263916016, "learning_rate": 1.2561894122828315e-05, "loss": 0.0588, "mean_token_accuracy": 0.9863285422325134, "num_tokens": 5787425.0, "step": 2715 }, { "entropy": 1.6849963307380675, "epoch": 0.8953258722843976, "grad_norm": 4.329936504364014, "learning_rate": 1.2535578821106648e-05, "loss": 0.0553, "mean_token_accuracy": 0.984381890296936, "num_tokens": 5798063.0, "step": 2720 }, { "entropy": 1.7037844061851501, "epoch": 0.8969716919025674, "grad_norm": 2.813596248626709, "learning_rate": 1.2509244740752748e-05, "loss": 0.0612, "mean_token_accuracy": 0.9847327530384063, "num_tokens": 5808513.0, "step": 2725 }, { "entropy": 1.763570249080658, "epoch": 0.8986175115207373, "grad_norm": 5.475368976593018, "learning_rate": 1.2482892076798216e-05, "loss": 0.0659, "mean_token_accuracy": 0.9854476451873779, "num_tokens": 5819032.0, "step": 2730 }, { "entropy": 1.8239656686782837, "epoch": 0.9002633311389072, "grad_norm": 2.5298304557800293, "learning_rate": 1.2456521024412287e-05, "loss": 0.0817, "mean_token_accuracy": 0.983061420917511, "num_tokens": 5829661.0, "step": 2735 }, { "entropy": 1.9393372416496277, "epoch": 0.901909150757077, "grad_norm": 2.5725841522216797, "learning_rate": 1.243013177890039e-05, "loss": 0.0466, "mean_token_accuracy": 0.987521517276764, "num_tokens": 5840288.0, "step": 2740 }, { "entropy": 2.0387110352516173, "epoch": 0.9035549703752469, "grad_norm": 2.8396382331848145, "learning_rate": 1.2403724535702679e-05, "loss": 0.1089, "mean_token_accuracy": 0.9775011360645294, "num_tokens": 5850912.0, "step": 2745 }, { "entropy": 2.0455712914466857, "epoch": 0.9052007899934167, "grad_norm": 3.0769193172454834, "learning_rate": 1.2377299490392618e-05, "loss": 0.0575, "mean_token_accuracy": 0.9838365972042084, "num_tokens": 5861530.0, "step": 2750 }, { "entropy": 2.0284469962120055, "epoch": 0.9068466096115866, "grad_norm": 3.171985626220703, "learning_rate": 1.23508568386755e-05, "loss": 0.0433, "mean_token_accuracy": 0.986922162771225, "num_tokens": 5872294.0, "step": 2755 }, { "entropy": 2.027714431285858, "epoch": 0.9084924292297564, "grad_norm": 1.4176881313323975, "learning_rate": 1.2324396776387014e-05, "loss": 0.0582, "mean_token_accuracy": 0.9820313692092896, "num_tokens": 5882926.0, "step": 2760 }, { "entropy": 1.9616046786308288, "epoch": 0.9101382488479263, "grad_norm": 1.9177738428115845, "learning_rate": 1.2297919499491797e-05, "loss": 0.0621, "mean_token_accuracy": 0.9839075744152069, "num_tokens": 5893515.0, "step": 2765 }, { "entropy": 1.8943446040153504, "epoch": 0.9117840684660962, "grad_norm": 3.43756103515625, "learning_rate": 1.2271425204081981e-05, "loss": 0.0688, "mean_token_accuracy": 0.9814131796360016, "num_tokens": 5904107.0, "step": 2770 }, { "entropy": 1.8612539172172546, "epoch": 0.913429888084266, "grad_norm": 2.007723808288574, "learning_rate": 1.2244914086375726e-05, "loss": 0.0566, "mean_token_accuracy": 0.9836967706680297, "num_tokens": 5914642.0, "step": 2775 }, { "entropy": 1.8655507802963256, "epoch": 0.9150757077024358, "grad_norm": 2.231388568878174, "learning_rate": 1.2218386342715793e-05, "loss": 0.0776, "mean_token_accuracy": 0.9767457246780396, "num_tokens": 5925381.0, "step": 2780 }, { "entropy": 1.9478851079940795, "epoch": 0.9167215273206056, "grad_norm": 3.511413335800171, "learning_rate": 1.2191842169568067e-05, "loss": 0.0627, "mean_token_accuracy": 0.9826519787311554, "num_tokens": 5936411.0, "step": 2785 }, { "entropy": 1.9535033702850342, "epoch": 0.9183673469387755, "grad_norm": 1.3818976879119873, "learning_rate": 1.2165281763520106e-05, "loss": 0.0284, "mean_token_accuracy": 0.9931237936019898, "num_tokens": 5947401.0, "step": 2790 }, { "entropy": 2.046694076061249, "epoch": 0.9200131665569453, "grad_norm": 2.708840847015381, "learning_rate": 1.2138705321279709e-05, "loss": 0.0357, "mean_token_accuracy": 0.9874314427375793, "num_tokens": 5957977.0, "step": 2795 }, { "entropy": 1.9919468641281128, "epoch": 0.9216589861751152, "grad_norm": 3.4317140579223633, "learning_rate": 1.2112113039673418e-05, "loss": 0.0491, "mean_token_accuracy": 0.9871409773826599, "num_tokens": 5968581.0, "step": 2800 }, { "entropy": 2.0010817289352416, "epoch": 0.9233048057932851, "grad_norm": 3.1390058994293213, "learning_rate": 1.2085505115645095e-05, "loss": 0.0463, "mean_token_accuracy": 0.9875739812850952, "num_tokens": 5979403.0, "step": 2805 }, { "entropy": 1.9707128882408143, "epoch": 0.9249506254114549, "grad_norm": 2.6084043979644775, "learning_rate": 1.2058881746254447e-05, "loss": 0.0593, "mean_token_accuracy": 0.9853511095046997, "num_tokens": 5990059.0, "step": 2810 }, { "entropy": 2.035856246948242, "epoch": 0.9265964450296248, "grad_norm": 1.959696888923645, "learning_rate": 1.203224312867557e-05, "loss": 0.0434, "mean_token_accuracy": 0.9854652583599091, "num_tokens": 6000522.0, "step": 2815 }, { "entropy": 2.0164458394050597, "epoch": 0.9282422646477946, "grad_norm": 5.868241786956787, "learning_rate": 1.2005589460195486e-05, "loss": 0.0773, "mean_token_accuracy": 0.9825805127620697, "num_tokens": 6011268.0, "step": 2820 }, { "entropy": 2.0600091218948364, "epoch": 0.9298880842659645, "grad_norm": 4.795652389526367, "learning_rate": 1.1978920938212691e-05, "loss": 0.0591, "mean_token_accuracy": 0.984807425737381, "num_tokens": 6022053.0, "step": 2825 }, { "entropy": 2.1145251750946046, "epoch": 0.9315339038841343, "grad_norm": 1.8055132627487183, "learning_rate": 1.1952237760235686e-05, "loss": 0.0611, "mean_token_accuracy": 0.982151460647583, "num_tokens": 6032622.0, "step": 2830 }, { "entropy": 2.1461195945739746, "epoch": 0.9331797235023042, "grad_norm": 3.280885934829712, "learning_rate": 1.192554012388151e-05, "loss": 0.0711, "mean_token_accuracy": 0.9854592263698578, "num_tokens": 6043224.0, "step": 2835 }, { "entropy": 2.1928713798522947, "epoch": 0.934825543120474, "grad_norm": 2.116999864578247, "learning_rate": 1.1898828226874284e-05, "loss": 0.0536, "mean_token_accuracy": 0.9854759395122528, "num_tokens": 6053910.0, "step": 2840 }, { "entropy": 2.153935670852661, "epoch": 0.9364713627386438, "grad_norm": 3.3620643615722656, "learning_rate": 1.1872102267043748e-05, "loss": 0.0525, "mean_token_accuracy": 0.9844566106796264, "num_tokens": 6064731.0, "step": 2845 }, { "entropy": 2.120525050163269, "epoch": 0.9381171823568137, "grad_norm": 2.1633312702178955, "learning_rate": 1.1845362442323784e-05, "loss": 0.0356, "mean_token_accuracy": 0.9903881013393402, "num_tokens": 6075595.0, "step": 2850 }, { "entropy": 2.168933653831482, "epoch": 0.9397630019749835, "grad_norm": 2.078237771987915, "learning_rate": 1.1818608950750967e-05, "loss": 0.045, "mean_token_accuracy": 0.9854483067989349, "num_tokens": 6085983.0, "step": 2855 }, { "entropy": 2.104006254673004, "epoch": 0.9414088215931534, "grad_norm": 1.4718915224075317, "learning_rate": 1.1791841990463083e-05, "loss": 0.0375, "mean_token_accuracy": 0.9886513173580169, "num_tokens": 6096996.0, "step": 2860 }, { "entropy": 2.048595929145813, "epoch": 0.9430546412113232, "grad_norm": 1.9670352935791016, "learning_rate": 1.1765061759697669e-05, "loss": 0.0316, "mean_token_accuracy": 0.9901619255542755, "num_tokens": 6107708.0, "step": 2865 }, { "entropy": 1.970208466053009, "epoch": 0.9447004608294931, "grad_norm": 5.284751892089844, "learning_rate": 1.1738268456790548e-05, "loss": 0.0672, "mean_token_accuracy": 0.981162142753601, "num_tokens": 6118639.0, "step": 2870 }, { "entropy": 1.8950253486633302, "epoch": 0.9463462804476629, "grad_norm": 4.18200159072876, "learning_rate": 1.171146228017435e-05, "loss": 0.0414, "mean_token_accuracy": 0.9900772452354432, "num_tokens": 6129781.0, "step": 2875 }, { "entropy": 1.8563172101974488, "epoch": 0.9479921000658328, "grad_norm": 2.1970016956329346, "learning_rate": 1.1684643428377056e-05, "loss": 0.058, "mean_token_accuracy": 0.9863704919815064, "num_tokens": 6140682.0, "step": 2880 }, { "entropy": 1.8867647051811218, "epoch": 0.9496379196840027, "grad_norm": 1.9754729270935059, "learning_rate": 1.1657812100020507e-05, "loss": 0.0588, "mean_token_accuracy": 0.9856977164745331, "num_tokens": 6151093.0, "step": 2885 }, { "entropy": 1.875185477733612, "epoch": 0.9512837393021725, "grad_norm": 4.4464592933654785, "learning_rate": 1.1630968493818961e-05, "loss": 0.062, "mean_token_accuracy": 0.9841465830802918, "num_tokens": 6161567.0, "step": 2890 }, { "entropy": 1.8699720621109008, "epoch": 0.9529295589203424, "grad_norm": 2.7016501426696777, "learning_rate": 1.1604112808577603e-05, "loss": 0.0727, "mean_token_accuracy": 0.9823096752166748, "num_tokens": 6172041.0, "step": 2895 }, { "entropy": 1.9755351066589355, "epoch": 0.9545753785385122, "grad_norm": 14.661170959472656, "learning_rate": 1.1577245243191068e-05, "loss": 0.0641, "mean_token_accuracy": 0.979709017276764, "num_tokens": 6182650.0, "step": 2900 }, { "entropy": 2.0633350372314454, "epoch": 0.956221198156682, "grad_norm": 3.1461470127105713, "learning_rate": 1.155036599664198e-05, "loss": 0.0693, "mean_token_accuracy": 0.9809099555015564, "num_tokens": 6193204.0, "step": 2905 }, { "entropy": 2.077620244026184, "epoch": 0.9578670177748518, "grad_norm": 1.7489817142486572, "learning_rate": 1.1523475267999477e-05, "loss": 0.0777, "mean_token_accuracy": 0.9819590091705322, "num_tokens": 6203825.0, "step": 2910 }, { "entropy": 2.087475097179413, "epoch": 0.9595128373930217, "grad_norm": 3.104898691177368, "learning_rate": 1.1496573256417733e-05, "loss": 0.0437, "mean_token_accuracy": 0.9875694811344147, "num_tokens": 6214779.0, "step": 2915 }, { "entropy": 2.038085675239563, "epoch": 0.9611586570111915, "grad_norm": 3.818824052810669, "learning_rate": 1.1469660161134481e-05, "loss": 0.0563, "mean_token_accuracy": 0.9853502690792084, "num_tokens": 6225540.0, "step": 2920 }, { "entropy": 1.9863147258758544, "epoch": 0.9628044766293614, "grad_norm": 3.5189995765686035, "learning_rate": 1.1442736181469546e-05, "loss": 0.0534, "mean_token_accuracy": 0.9900292754173279, "num_tokens": 6236269.0, "step": 2925 }, { "entropy": 1.8946941256523133, "epoch": 0.9644502962475313, "grad_norm": 1.297648549079895, "learning_rate": 1.1415801516823358e-05, "loss": 0.0523, "mean_token_accuracy": 0.9893131017684936, "num_tokens": 6247052.0, "step": 2930 }, { "entropy": 1.9294702768325807, "epoch": 0.9660961158657011, "grad_norm": 2.645850419998169, "learning_rate": 1.1388856366675482e-05, "loss": 0.0705, "mean_token_accuracy": 0.9818682849407196, "num_tokens": 6257581.0, "step": 2935 }, { "entropy": 1.9064095735549926, "epoch": 0.967741935483871, "grad_norm": 2.203904151916504, "learning_rate": 1.1361900930583143e-05, "loss": 0.0589, "mean_token_accuracy": 0.9860333859920501, "num_tokens": 6268381.0, "step": 2940 }, { "entropy": 1.8867696881294251, "epoch": 0.9693877551020408, "grad_norm": 4.154993534088135, "learning_rate": 1.1334935408179736e-05, "loss": 0.0495, "mean_token_accuracy": 0.9842896819114685, "num_tokens": 6279423.0, "step": 2945 }, { "entropy": 1.8813761711120605, "epoch": 0.9710335747202107, "grad_norm": 1.7893234491348267, "learning_rate": 1.1307959999173362e-05, "loss": 0.0524, "mean_token_accuracy": 0.9866505920886993, "num_tokens": 6290068.0, "step": 2950 }, { "entropy": 1.915200400352478, "epoch": 0.9726793943383805, "grad_norm": 2.213712692260742, "learning_rate": 1.1280974903345347e-05, "loss": 0.064, "mean_token_accuracy": 0.9851447463035583, "num_tokens": 6300817.0, "step": 2955 }, { "entropy": 1.9231054186820984, "epoch": 0.9743252139565504, "grad_norm": 1.9981787204742432, "learning_rate": 1.1253980320548746e-05, "loss": 0.0434, "mean_token_accuracy": 0.9863059282302856, "num_tokens": 6311420.0, "step": 2960 }, { "entropy": 1.9738001823425293, "epoch": 0.9759710335747203, "grad_norm": 0.5440634489059448, "learning_rate": 1.1226976450706887e-05, "loss": 0.041, "mean_token_accuracy": 0.9861558496952056, "num_tokens": 6321771.0, "step": 2965 }, { "entropy": 1.9686074256896973, "epoch": 0.97761685319289, "grad_norm": 5.0454936027526855, "learning_rate": 1.119996349381187e-05, "loss": 0.0772, "mean_token_accuracy": 0.9804552078247071, "num_tokens": 6332139.0, "step": 2970 }, { "entropy": 1.9591160893440247, "epoch": 0.9792626728110599, "grad_norm": 1.6458666324615479, "learning_rate": 1.1172941649923096e-05, "loss": 0.0463, "mean_token_accuracy": 0.9881701529026031, "num_tokens": 6342546.0, "step": 2975 }, { "entropy": 2.0319183111190795, "epoch": 0.9809084924292297, "grad_norm": 2.887859582901001, "learning_rate": 1.1145911119165789e-05, "loss": 0.0553, "mean_token_accuracy": 0.9828070223331451, "num_tokens": 6353442.0, "step": 2980 }, { "entropy": 2.058086669445038, "epoch": 0.9825543120473996, "grad_norm": 0.9121004343032837, "learning_rate": 1.11188721017295e-05, "loss": 0.0366, "mean_token_accuracy": 0.9888731300830841, "num_tokens": 6363972.0, "step": 2985 }, { "entropy": 1.9924060463905335, "epoch": 0.9842001316655694, "grad_norm": 1.7889047861099243, "learning_rate": 1.1091824797866639e-05, "loss": 0.0469, "mean_token_accuracy": 0.9876609027385712, "num_tokens": 6374648.0, "step": 2990 }, { "entropy": 1.9006078600883485, "epoch": 0.9858459512837393, "grad_norm": 3.3813469409942627, "learning_rate": 1.1064769407890986e-05, "loss": 0.0228, "mean_token_accuracy": 0.9935931503772736, "num_tokens": 6385573.0, "step": 2995 }, { "entropy": 1.8375752925872804, "epoch": 0.9874917709019092, "grad_norm": 4.215015888214111, "learning_rate": 1.1037706132176197e-05, "loss": 0.0313, "mean_token_accuracy": 0.9893752455711364, "num_tokens": 6396486.0, "step": 3000 }, { "entropy": 1.7838311195373535, "epoch": 0.989137590520079, "grad_norm": 1.322409749031067, "learning_rate": 1.1010635171154342e-05, "loss": 0.071, "mean_token_accuracy": 0.9886986196041108, "num_tokens": 6407411.0, "step": 3005 }, { "entropy": 1.7372970938682557, "epoch": 0.9907834101382489, "grad_norm": 2.0234224796295166, "learning_rate": 1.098355672531441e-05, "loss": 0.0549, "mean_token_accuracy": 0.9890697836875916, "num_tokens": 6418480.0, "step": 3010 }, { "entropy": 1.8203161001205443, "epoch": 0.9924292297564187, "grad_norm": 4.083996295928955, "learning_rate": 1.0956470995200816e-05, "loss": 0.0559, "mean_token_accuracy": 0.9861119568347931, "num_tokens": 6429376.0, "step": 3015 }, { "entropy": 1.8980080366134644, "epoch": 0.9940750493745886, "grad_norm": 2.3635177612304688, "learning_rate": 1.0929378181411918e-05, "loss": 0.0407, "mean_token_accuracy": 0.9869163811206818, "num_tokens": 6440084.0, "step": 3020 }, { "entropy": 1.9592399954795838, "epoch": 0.9957208689927584, "grad_norm": 1.469984769821167, "learning_rate": 1.0902278484598549e-05, "loss": 0.046, "mean_token_accuracy": 0.9860125243663788, "num_tokens": 6450632.0, "step": 3025 }, { "entropy": 1.943799901008606, "epoch": 0.9973666886109283, "grad_norm": 2.726168394088745, "learning_rate": 1.0875172105462513e-05, "loss": 0.0617, "mean_token_accuracy": 0.9823821485042572, "num_tokens": 6461324.0, "step": 3030 }, { "entropy": 1.979371726512909, "epoch": 0.999012508229098, "grad_norm": 3.588101625442505, "learning_rate": 1.0848059244755093e-05, "loss": 0.04, "mean_token_accuracy": 0.9917591452598572, "num_tokens": 6472058.0, "step": 3035 }, { "entropy": 2.0093634843826296, "epoch": 1.000658327847268, "grad_norm": 1.96505868434906, "learning_rate": 1.0820940103275594e-05, "loss": 0.0413, "mean_token_accuracy": 0.9876884579658508, "num_tokens": 6482261.0, "step": 3040 }, { "epoch": 1.000658327847268, "eval_entropy": 2.00399693047966, "eval_loss": 0.05508637800812721, "eval_mean_token_accuracy": 0.9851879490383764, "eval_num_tokens": 6482261.0, "eval_runtime": 196.5531, "eval_samples_per_second": 42.365, "eval_steps_per_second": 7.062, "step": 3040 }, { "entropy": 2.0180766105651857, "epoch": 1.0023041474654377, "grad_norm": 1.1969290971755981, "learning_rate": 1.079381488186982e-05, "loss": 0.0215, "mean_token_accuracy": 0.9949158251285553, "num_tokens": 6492816.0, "step": 3045 }, { "entropy": 1.9490231990814209, "epoch": 1.0039499670836076, "grad_norm": 0.8785367012023926, "learning_rate": 1.0766683781428617e-05, "loss": 0.0451, "mean_token_accuracy": 0.9905852675437927, "num_tokens": 6503215.0, "step": 3050 }, { "entropy": 1.9013155460357667, "epoch": 1.0055957867017775, "grad_norm": 2.4878368377685547, "learning_rate": 1.0739547002886361e-05, "loss": 0.0177, "mean_token_accuracy": 0.9950673639774322, "num_tokens": 6514095.0, "step": 3055 }, { "entropy": 1.8226952791213988, "epoch": 1.0072416063199474, "grad_norm": 1.9651074409484863, "learning_rate": 1.0712404747219481e-05, "loss": 0.0253, "mean_token_accuracy": 0.9917102456092834, "num_tokens": 6525039.0, "step": 3060 }, { "entropy": 1.7761475205421449, "epoch": 1.008887425938117, "grad_norm": 0.24293629825115204, "learning_rate": 1.0685257215444975e-05, "loss": 0.0185, "mean_token_accuracy": 0.9937527477741241, "num_tokens": 6535691.0, "step": 3065 }, { "entropy": 1.7371309757232667, "epoch": 1.010533245556287, "grad_norm": 1.073920726776123, "learning_rate": 1.0658104608618917e-05, "loss": 0.0273, "mean_token_accuracy": 0.9902878165245056, "num_tokens": 6546449.0, "step": 3070 }, { "entropy": 1.7515403628349304, "epoch": 1.0121790651744569, "grad_norm": 1.7505379915237427, "learning_rate": 1.0630947127834962e-05, "loss": 0.0213, "mean_token_accuracy": 0.9931905210018158, "num_tokens": 6557347.0, "step": 3075 }, { "entropy": 1.723526620864868, "epoch": 1.0138248847926268, "grad_norm": 3.507692575454712, "learning_rate": 1.0603784974222862e-05, "loss": 0.0429, "mean_token_accuracy": 0.9902272820472717, "num_tokens": 6568263.0, "step": 3080 }, { "entropy": 1.7295622825622559, "epoch": 1.0154707044107967, "grad_norm": 1.3187499046325684, "learning_rate": 1.0576618348946982e-05, "loss": 0.0139, "mean_token_accuracy": 0.9940691888332367, "num_tokens": 6579159.0, "step": 3085 }, { "entropy": 1.7557840943336487, "epoch": 1.0171165240289664, "grad_norm": 0.35923171043395996, "learning_rate": 1.0549447453204793e-05, "loss": 0.0163, "mean_token_accuracy": 0.9933087825775146, "num_tokens": 6589930.0, "step": 3090 }, { "entropy": 1.7555920958518982, "epoch": 1.0187623436471362, "grad_norm": 2.311398983001709, "learning_rate": 1.0522272488225411e-05, "loss": 0.0226, "mean_token_accuracy": 0.9911885142326355, "num_tokens": 6600428.0, "step": 3095 }, { "entropy": 1.6874248266220093, "epoch": 1.0204081632653061, "grad_norm": 2.7232701778411865, "learning_rate": 1.049509365526807e-05, "loss": 0.0322, "mean_token_accuracy": 0.9916168689727783, "num_tokens": 6611008.0, "step": 3100 }, { "entropy": 1.6934003949165344, "epoch": 1.022053982883476, "grad_norm": 3.6890053749084473, "learning_rate": 1.0467911155620664e-05, "loss": 0.0252, "mean_token_accuracy": 0.9931854546070099, "num_tokens": 6621437.0, "step": 3105 }, { "entropy": 1.620060884952545, "epoch": 1.0236998025016457, "grad_norm": 1.3696199655532837, "learning_rate": 1.044072519059824e-05, "loss": 0.055, "mean_token_accuracy": 0.9850696086883545, "num_tokens": 6632746.0, "step": 3110 }, { "entropy": 1.725310730934143, "epoch": 1.0253456221198156, "grad_norm": 0.7018516659736633, "learning_rate": 1.0413535961541499e-05, "loss": 0.0239, "mean_token_accuracy": 0.9919134438037872, "num_tokens": 6643199.0, "step": 3115 }, { "entropy": 1.6991363763809204, "epoch": 1.0269914417379855, "grad_norm": 0.6958704590797424, "learning_rate": 1.0386343669815333e-05, "loss": 0.0304, "mean_token_accuracy": 0.991345739364624, "num_tokens": 6653990.0, "step": 3120 }, { "entropy": 1.7753076672554016, "epoch": 1.0286372613561554, "grad_norm": 3.0339064598083496, "learning_rate": 1.0359148516807302e-05, "loss": 0.0442, "mean_token_accuracy": 0.9869567334651947, "num_tokens": 6664947.0, "step": 3125 }, { "entropy": 1.7934301972389222, "epoch": 1.0302830809743253, "grad_norm": 3.500246047973633, "learning_rate": 1.0331950703926165e-05, "loss": 0.0589, "mean_token_accuracy": 0.9841761589050293, "num_tokens": 6675621.0, "step": 3130 }, { "entropy": 1.8426469326019288, "epoch": 1.031928900592495, "grad_norm": 1.5313292741775513, "learning_rate": 1.0304750432600377e-05, "loss": 0.0261, "mean_token_accuracy": 0.9919860601425171, "num_tokens": 6686232.0, "step": 3135 }, { "entropy": 1.8473934888839723, "epoch": 1.0335747202106649, "grad_norm": 2.6027140617370605, "learning_rate": 1.02775479042766e-05, "loss": 0.0334, "mean_token_accuracy": 0.9884464621543885, "num_tokens": 6696624.0, "step": 3140 }, { "entropy": 1.8324166655540466, "epoch": 1.0352205398288348, "grad_norm": 0.43423017859458923, "learning_rate": 1.0250343320418215e-05, "loss": 0.0232, "mean_token_accuracy": 0.9935197830200195, "num_tokens": 6707078.0, "step": 3145 }, { "entropy": 1.8373860239982605, "epoch": 1.0368663594470047, "grad_norm": 3.9265055656433105, "learning_rate": 1.0223136882503821e-05, "loss": 0.0292, "mean_token_accuracy": 0.9914808392524719, "num_tokens": 6717515.0, "step": 3150 }, { "entropy": 1.850171196460724, "epoch": 1.0385121790651746, "grad_norm": 1.000176191329956, "learning_rate": 1.0195928792025754e-05, "loss": 0.0236, "mean_token_accuracy": 0.9916164934635162, "num_tokens": 6727947.0, "step": 3155 }, { "entropy": 1.8611099600791932, "epoch": 1.0401579986833442, "grad_norm": 0.9246273636817932, "learning_rate": 1.016871925048858e-05, "loss": 0.0208, "mean_token_accuracy": 0.9946603596210479, "num_tokens": 6738673.0, "step": 3160 }, { "entropy": 1.8230043292045592, "epoch": 1.0418038183015141, "grad_norm": 2.162306785583496, "learning_rate": 1.0141508459407622e-05, "loss": 0.0322, "mean_token_accuracy": 0.9894011855125427, "num_tokens": 6749351.0, "step": 3165 }, { "entropy": 1.810911202430725, "epoch": 1.043449637919684, "grad_norm": 0.9168899655342102, "learning_rate": 1.0114296620307455e-05, "loss": 0.0227, "mean_token_accuracy": 0.992879319190979, "num_tokens": 6760013.0, "step": 3170 }, { "entropy": 1.7807167649269104, "epoch": 1.045095457537854, "grad_norm": 3.3147082328796387, "learning_rate": 1.0087083934720407e-05, "loss": 0.0245, "mean_token_accuracy": 0.9914108633995056, "num_tokens": 6770834.0, "step": 3175 }, { "entropy": 1.7569178104400636, "epoch": 1.0467412771560236, "grad_norm": 0.19522562623023987, "learning_rate": 1.0059870604185087e-05, "loss": 0.0244, "mean_token_accuracy": 0.9945703566074371, "num_tokens": 6781679.0, "step": 3180 }, { "entropy": 1.800702166557312, "epoch": 1.0483870967741935, "grad_norm": 0.2634085714817047, "learning_rate": 1.003265683024487e-05, "loss": 0.022, "mean_token_accuracy": 0.9923857629299164, "num_tokens": 6792206.0, "step": 3185 }, { "entropy": 1.78583163022995, "epoch": 1.0500329163923634, "grad_norm": 2.332631826400757, "learning_rate": 1.0005442814446427e-05, "loss": 0.0354, "mean_token_accuracy": 0.9924177765846253, "num_tokens": 6802785.0, "step": 3190 }, { "entropy": 1.7854429602622985, "epoch": 1.0516787360105333, "grad_norm": 3.857931613922119, "learning_rate": 9.97822875833821e-06, "loss": 0.0436, "mean_token_accuracy": 0.9893366694450378, "num_tokens": 6813246.0, "step": 3195 }, { "entropy": 1.7886322617530823, "epoch": 1.0533245556287032, "grad_norm": 0.9116196632385254, "learning_rate": 9.951014863468971e-06, "loss": 0.0308, "mean_token_accuracy": 0.9903899192810058, "num_tokens": 6824016.0, "step": 3200 }, { "entropy": 1.831316077709198, "epoch": 1.0549703752468729, "grad_norm": 4.522629737854004, "learning_rate": 9.92380133138627e-06, "loss": 0.0151, "mean_token_accuracy": 0.9942168056964874, "num_tokens": 6834519.0, "step": 3205 }, { "entropy": 1.8389117002487183, "epoch": 1.0566161948650428, "grad_norm": 3.8139638900756836, "learning_rate": 9.896588363634983e-06, "loss": 0.0348, "mean_token_accuracy": 0.9891720294952393, "num_tokens": 6845234.0, "step": 3210 }, { "entropy": 1.858761990070343, "epoch": 1.0582620144832127, "grad_norm": 1.8006324768066406, "learning_rate": 9.869376161755797e-06, "loss": 0.0308, "mean_token_accuracy": 0.9904006004333497, "num_tokens": 6855897.0, "step": 3215 }, { "entropy": 1.9025790572166443, "epoch": 1.0599078341013826, "grad_norm": 1.6231938600540161, "learning_rate": 9.842164927283734e-06, "loss": 0.0227, "mean_token_accuracy": 0.9929196774959564, "num_tokens": 6866586.0, "step": 3220 }, { "entropy": 1.8303877234458923, "epoch": 1.0615536537195522, "grad_norm": 2.040353298187256, "learning_rate": 9.814954861746661e-06, "loss": 0.0155, "mean_token_accuracy": 0.9953079938888549, "num_tokens": 6877250.0, "step": 3225 }, { "entropy": 1.799347722530365, "epoch": 1.0631994733377221, "grad_norm": 3.7308638095855713, "learning_rate": 9.787746166663765e-06, "loss": 0.0368, "mean_token_accuracy": 0.9898776650428772, "num_tokens": 6887880.0, "step": 3230 }, { "entropy": 1.7600205421447754, "epoch": 1.064845292955892, "grad_norm": 2.044337749481201, "learning_rate": 9.760539043544105e-06, "loss": 0.0464, "mean_token_accuracy": 0.9879779398441315, "num_tokens": 6898466.0, "step": 3235 }, { "entropy": 1.7838748335838317, "epoch": 1.066491112574062, "grad_norm": 0.9999270439147949, "learning_rate": 9.733333693885078e-06, "loss": 0.0282, "mean_token_accuracy": 0.9914038121700287, "num_tokens": 6909129.0, "step": 3240 }, { "entropy": 1.747081458568573, "epoch": 1.0681369321922318, "grad_norm": 0.998917281627655, "learning_rate": 9.706130319170968e-06, "loss": 0.0376, "mean_token_accuracy": 0.9899788439273834, "num_tokens": 6919953.0, "step": 3245 }, { "entropy": 1.8464920043945312, "epoch": 1.0697827518104015, "grad_norm": 0.4222843050956726, "learning_rate": 9.678929120871414e-06, "loss": 0.0173, "mean_token_accuracy": 0.994493979215622, "num_tokens": 6930503.0, "step": 3250 }, { "entropy": 1.8613513350486754, "epoch": 1.0714285714285714, "grad_norm": 5.423548698425293, "learning_rate": 9.651730300439954e-06, "loss": 0.0657, "mean_token_accuracy": 0.9828275620937348, "num_tokens": 6941390.0, "step": 3255 }, { "entropy": 1.809139084815979, "epoch": 1.0730743910467413, "grad_norm": 1.3602713346481323, "learning_rate": 9.62453405931249e-06, "loss": 0.0237, "mean_token_accuracy": 0.9926668286323548, "num_tokens": 6952211.0, "step": 3260 }, { "entropy": 1.8179643034934998, "epoch": 1.0747202106649112, "grad_norm": 1.4231082201004028, "learning_rate": 9.597340598905851e-06, "loss": 0.0248, "mean_token_accuracy": 0.9913124799728393, "num_tokens": 6962822.0, "step": 3265 }, { "entropy": 1.7864566802978517, "epoch": 1.076366030283081, "grad_norm": 1.2587391138076782, "learning_rate": 9.57015012061625e-06, "loss": 0.0269, "mean_token_accuracy": 0.9924269318580627, "num_tokens": 6973461.0, "step": 3270 }, { "entropy": 1.7307344913482665, "epoch": 1.0780118499012508, "grad_norm": 0.7557618021965027, "learning_rate": 9.542962825817827e-06, "loss": 0.0224, "mean_token_accuracy": 0.9931135952472687, "num_tokens": 6984212.0, "step": 3275 }, { "entropy": 1.7689846754074097, "epoch": 1.0796576695194207, "grad_norm": 3.2683939933776855, "learning_rate": 9.515778915861136e-06, "loss": 0.0494, "mean_token_accuracy": 0.9892696917057038, "num_tokens": 6994814.0, "step": 3280 }, { "entropy": 1.862963318824768, "epoch": 1.0813034891375906, "grad_norm": 2.394611120223999, "learning_rate": 9.488598592071668e-06, "loss": 0.0244, "mean_token_accuracy": 0.9937925040721893, "num_tokens": 7005314.0, "step": 3285 }, { "entropy": 1.8323248624801636, "epoch": 1.0829493087557605, "grad_norm": 0.8254493474960327, "learning_rate": 9.461422055748357e-06, "loss": 0.0258, "mean_token_accuracy": 0.9926515042781829, "num_tokens": 7015752.0, "step": 3290 }, { "entropy": 1.8604917049407959, "epoch": 1.0845951283739301, "grad_norm": 0.3408713936805725, "learning_rate": 9.434249508162076e-06, "loss": 0.0322, "mean_token_accuracy": 0.991688358783722, "num_tokens": 7026426.0, "step": 3295 }, { "entropy": 1.8841025471687316, "epoch": 1.0862409479921, "grad_norm": 2.080605983734131, "learning_rate": 9.407081150554172e-06, "loss": 0.0284, "mean_token_accuracy": 0.9886221408843994, "num_tokens": 7036922.0, "step": 3300 }, { "entropy": 1.9134224891662597, "epoch": 1.08788676761027, "grad_norm": 1.2799135446548462, "learning_rate": 9.379917184134949e-06, "loss": 0.0271, "mean_token_accuracy": 0.9915643930435181, "num_tokens": 7047343.0, "step": 3305 }, { "entropy": 1.8705042123794555, "epoch": 1.0895325872284398, "grad_norm": 0.8514411449432373, "learning_rate": 9.352757810082196e-06, "loss": 0.0239, "mean_token_accuracy": 0.9933820009231568, "num_tokens": 7057946.0, "step": 3310 }, { "entropy": 1.8473629593849181, "epoch": 1.0911784068466095, "grad_norm": 0.8632181286811829, "learning_rate": 9.325603229539684e-06, "loss": 0.0216, "mean_token_accuracy": 0.9923954606056213, "num_tokens": 7068260.0, "step": 3315 }, { "entropy": 1.8017066240310669, "epoch": 1.0928242264647794, "grad_norm": 1.6297268867492676, "learning_rate": 9.298453643615692e-06, "loss": 0.0211, "mean_token_accuracy": 0.9935080289840699, "num_tokens": 7078998.0, "step": 3320 }, { "entropy": 1.7489684343338012, "epoch": 1.0944700460829493, "grad_norm": 0.9504618048667908, "learning_rate": 9.2713092533815e-06, "loss": 0.0241, "mean_token_accuracy": 0.9920541942119598, "num_tokens": 7090092.0, "step": 3325 }, { "entropy": 1.753550660610199, "epoch": 1.0961158657011192, "grad_norm": 2.1300246715545654, "learning_rate": 9.244170259869918e-06, "loss": 0.0414, "mean_token_accuracy": 0.9851863026618958, "num_tokens": 7100582.0, "step": 3330 }, { "entropy": 1.7830381631851195, "epoch": 1.097761685319289, "grad_norm": 1.9206792116165161, "learning_rate": 9.217036864073776e-06, "loss": 0.0305, "mean_token_accuracy": 0.989460003376007, "num_tokens": 7110954.0, "step": 3335 }, { "entropy": 1.7082879304885865, "epoch": 1.0994075049374588, "grad_norm": 1.520218849182129, "learning_rate": 9.189909266944459e-06, "loss": 0.0367, "mean_token_accuracy": 0.9915473163127899, "num_tokens": 7121660.0, "step": 3340 }, { "entropy": 1.7186639666557313, "epoch": 1.1010533245556287, "grad_norm": 0.7555062770843506, "learning_rate": 9.162787669390398e-06, "loss": 0.0332, "mean_token_accuracy": 0.9881877601146698, "num_tokens": 7132239.0, "step": 3345 }, { "entropy": 1.7227881669998169, "epoch": 1.1026991441737986, "grad_norm": 1.7483959197998047, "learning_rate": 9.135672272275593e-06, "loss": 0.0258, "mean_token_accuracy": 0.9922352850437164, "num_tokens": 7143078.0, "step": 3350 }, { "entropy": 1.7311092615127563, "epoch": 1.1043449637919684, "grad_norm": 0.43958139419555664, "learning_rate": 9.10856327641813e-06, "loss": 0.0279, "mean_token_accuracy": 0.9921256840229035, "num_tokens": 7153701.0, "step": 3355 }, { "entropy": 1.6882481575012207, "epoch": 1.1059907834101383, "grad_norm": 2.0378565788269043, "learning_rate": 9.081460882588668e-06, "loss": 0.0168, "mean_token_accuracy": 0.995065587759018, "num_tokens": 7164253.0, "step": 3360 }, { "entropy": 1.6925435185432434, "epoch": 1.107636603028308, "grad_norm": 0.6830073595046997, "learning_rate": 9.054365291508998e-06, "loss": 0.0258, "mean_token_accuracy": 0.9935117542743683, "num_tokens": 7175017.0, "step": 3365 }, { "entropy": 1.7125192880630493, "epoch": 1.109282422646478, "grad_norm": 0.8072521090507507, "learning_rate": 9.027276703850505e-06, "loss": 0.0279, "mean_token_accuracy": 0.9914268732070923, "num_tokens": 7185582.0, "step": 3370 }, { "entropy": 1.6770400524139404, "epoch": 1.1109282422646478, "grad_norm": 3.0566606521606445, "learning_rate": 9.000195320232724e-06, "loss": 0.0457, "mean_token_accuracy": 0.987758994102478, "num_tokens": 7196254.0, "step": 3375 }, { "entropy": 1.7625356912612915, "epoch": 1.1125740618828177, "grad_norm": 2.011699914932251, "learning_rate": 8.973121341221823e-06, "loss": 0.0248, "mean_token_accuracy": 0.9929955065250397, "num_tokens": 7206923.0, "step": 3380 }, { "entropy": 1.7391006708145142, "epoch": 1.1142198815009876, "grad_norm": 2.011082887649536, "learning_rate": 8.946054967329142e-06, "loss": 0.0243, "mean_token_accuracy": 0.9917931973934173, "num_tokens": 7217251.0, "step": 3385 }, { "entropy": 1.732879626750946, "epoch": 1.1158657011191573, "grad_norm": 1.593976378440857, "learning_rate": 8.918996399009689e-06, "loss": 0.0246, "mean_token_accuracy": 0.9882166802883148, "num_tokens": 7227764.0, "step": 3390 }, { "entropy": 1.6700308084487916, "epoch": 1.1175115207373272, "grad_norm": 1.129323124885559, "learning_rate": 8.891945836660673e-06, "loss": 0.0283, "mean_token_accuracy": 0.9915276169776917, "num_tokens": 7238505.0, "step": 3395 }, { "entropy": 1.688683032989502, "epoch": 1.119157340355497, "grad_norm": 2.0232725143432617, "learning_rate": 8.864903480619996e-06, "loss": 0.0405, "mean_token_accuracy": 0.9898577332496643, "num_tokens": 7249030.0, "step": 3400 }, { "entropy": 1.6767473936080932, "epoch": 1.120803159973667, "grad_norm": 1.5392571687698364, "learning_rate": 8.837869531164792e-06, "loss": 0.0253, "mean_token_accuracy": 0.9922107934951783, "num_tokens": 7259760.0, "step": 3405 }, { "entropy": 1.6974989771842957, "epoch": 1.1224489795918366, "grad_norm": 1.918980360031128, "learning_rate": 8.810844188509946e-06, "loss": 0.0405, "mean_token_accuracy": 0.9870936632156372, "num_tokens": 7270389.0, "step": 3410 }, { "entropy": 1.6960701942443848, "epoch": 1.1240947992100065, "grad_norm": 1.4752129316329956, "learning_rate": 8.783827652806577e-06, "loss": 0.0225, "mean_token_accuracy": 0.9912503600120545, "num_tokens": 7281116.0, "step": 3415 }, { "entropy": 1.6978899121284485, "epoch": 1.1257406188281764, "grad_norm": 2.9839565753936768, "learning_rate": 8.756820124140602e-06, "loss": 0.0327, "mean_token_accuracy": 0.9897141098976135, "num_tokens": 7291516.0, "step": 3420 }, { "entropy": 1.6348261475563048, "epoch": 1.1273864384463463, "grad_norm": 2.958974599838257, "learning_rate": 8.729821802531213e-06, "loss": 0.0287, "mean_token_accuracy": 0.9899660766124725, "num_tokens": 7302127.0, "step": 3425 }, { "entropy": 1.671372151374817, "epoch": 1.129032258064516, "grad_norm": 1.5821384191513062, "learning_rate": 8.70283288792943e-06, "loss": 0.0213, "mean_token_accuracy": 0.9928022623062134, "num_tokens": 7312938.0, "step": 3430 }, { "entropy": 1.7028130054473878, "epoch": 1.130678077682686, "grad_norm": 1.062685489654541, "learning_rate": 8.67585358021659e-06, "loss": 0.037, "mean_token_accuracy": 0.9904503941535949, "num_tokens": 7323516.0, "step": 3435 }, { "entropy": 1.698065197467804, "epoch": 1.1323238973008558, "grad_norm": 3.0057718753814697, "learning_rate": 8.648884079202896e-06, "loss": 0.0186, "mean_token_accuracy": 0.9931861937046051, "num_tokens": 7333911.0, "step": 3440 }, { "entropy": 1.6505720615386963, "epoch": 1.1339697169190257, "grad_norm": 1.6598554849624634, "learning_rate": 8.6219245846259e-06, "loss": 0.0174, "mean_token_accuracy": 0.9931141972541809, "num_tokens": 7344558.0, "step": 3445 }, { "entropy": 1.6310924172401429, "epoch": 1.1356155365371956, "grad_norm": 2.144071578979492, "learning_rate": 8.594975296149076e-06, "loss": 0.0195, "mean_token_accuracy": 0.9929544270038605, "num_tokens": 7355291.0, "step": 3450 }, { "entropy": 1.600458061695099, "epoch": 1.1372613561553653, "grad_norm": 0.6878706216812134, "learning_rate": 8.568036413360283e-06, "loss": 0.0291, "mean_token_accuracy": 0.9917424976825714, "num_tokens": 7365889.0, "step": 3455 }, { "entropy": 1.6151419758796692, "epoch": 1.1389071757735352, "grad_norm": 1.7664144039154053, "learning_rate": 8.541108135770327e-06, "loss": 0.0225, "mean_token_accuracy": 0.9897194743156433, "num_tokens": 7376467.0, "step": 3460 }, { "entropy": 1.6531212329864502, "epoch": 1.140552995391705, "grad_norm": 3.0760936737060547, "learning_rate": 8.514190662811477e-06, "loss": 0.0251, "mean_token_accuracy": 0.9931772351264954, "num_tokens": 7387048.0, "step": 3465 }, { "entropy": 1.6679503798484803, "epoch": 1.142198815009875, "grad_norm": 2.6920888423919678, "learning_rate": 8.48728419383597e-06, "loss": 0.0298, "mean_token_accuracy": 0.9894031941890716, "num_tokens": 7397403.0, "step": 3470 }, { "entropy": 1.5688734650611877, "epoch": 1.1438446346280449, "grad_norm": 2.5779929161071777, "learning_rate": 8.46038892811456e-06, "loss": 0.0249, "mean_token_accuracy": 0.9909287571907044, "num_tokens": 7408276.0, "step": 3475 }, { "entropy": 1.6224292755126952, "epoch": 1.1454904542462145, "grad_norm": 0.8870710134506226, "learning_rate": 8.433505064835012e-06, "loss": 0.0293, "mean_token_accuracy": 0.9906193792819977, "num_tokens": 7418781.0, "step": 3480 }, { "entropy": 1.644513690471649, "epoch": 1.1471362738643844, "grad_norm": 1.0466939210891724, "learning_rate": 8.406632803100665e-06, "loss": 0.034, "mean_token_accuracy": 0.9898303151130676, "num_tokens": 7429487.0, "step": 3485 }, { "entropy": 1.6431127905845642, "epoch": 1.1487820934825543, "grad_norm": 2.5962440967559814, "learning_rate": 8.379772341928916e-06, "loss": 0.0348, "mean_token_accuracy": 0.9878771305084229, "num_tokens": 7439874.0, "step": 3490 }, { "entropy": 1.6485112071037293, "epoch": 1.1504279131007242, "grad_norm": 5.498775005340576, "learning_rate": 8.352923880249784e-06, "loss": 0.0329, "mean_token_accuracy": 0.9888358414173126, "num_tokens": 7450555.0, "step": 3495 }, { "entropy": 1.648984158039093, "epoch": 1.1520737327188941, "grad_norm": 1.443813681602478, "learning_rate": 8.326087616904401e-06, "loss": 0.038, "mean_token_accuracy": 0.9887622356414795, "num_tokens": 7461255.0, "step": 3500 }, { "entropy": 1.6465593934059144, "epoch": 1.1537195523370638, "grad_norm": 1.363283395767212, "learning_rate": 8.299263750643577e-06, "loss": 0.0244, "mean_token_accuracy": 0.9922839045524597, "num_tokens": 7472013.0, "step": 3505 }, { "entropy": 1.6541425347328187, "epoch": 1.1553653719552337, "grad_norm": 2.518134355545044, "learning_rate": 8.272452480126292e-06, "loss": 0.0341, "mean_token_accuracy": 0.9866783797740937, "num_tokens": 7483066.0, "step": 3510 }, { "entropy": 1.6475726962089539, "epoch": 1.1570111915734036, "grad_norm": 2.8579108715057373, "learning_rate": 8.245654003918253e-06, "loss": 0.0305, "mean_token_accuracy": 0.9904314935207367, "num_tokens": 7493486.0, "step": 3515 }, { "entropy": 1.6358031511306763, "epoch": 1.1586570111915735, "grad_norm": 2.28322172164917, "learning_rate": 8.218868520490404e-06, "loss": 0.0154, "mean_token_accuracy": 0.9955400943756103, "num_tokens": 7504061.0, "step": 3520 }, { "entropy": 1.6113605737686156, "epoch": 1.1603028308097432, "grad_norm": 1.8561283349990845, "learning_rate": 8.192096228217464e-06, "loss": 0.0138, "mean_token_accuracy": 0.9960749268531799, "num_tokens": 7514617.0, "step": 3525 }, { "entropy": 1.5644055128097534, "epoch": 1.161948650427913, "grad_norm": 2.0931262969970703, "learning_rate": 8.165337325376467e-06, "loss": 0.0282, "mean_token_accuracy": 0.9914573907852173, "num_tokens": 7525370.0, "step": 3530 }, { "entropy": 1.502221119403839, "epoch": 1.163594470046083, "grad_norm": 2.6928858757019043, "learning_rate": 8.138592010145273e-06, "loss": 0.0152, "mean_token_accuracy": 0.9963742852210998, "num_tokens": 7536226.0, "step": 3535 }, { "entropy": 1.5423286318778993, "epoch": 1.1652402896642529, "grad_norm": 2.7901740074157715, "learning_rate": 8.111860480601117e-06, "loss": 0.0464, "mean_token_accuracy": 0.9899639308452606, "num_tokens": 7546765.0, "step": 3540 }, { "entropy": 1.6044429183006286, "epoch": 1.1668861092824225, "grad_norm": 0.5412013530731201, "learning_rate": 8.085142934719131e-06, "loss": 0.015, "mean_token_accuracy": 0.9947813093662262, "num_tokens": 7557291.0, "step": 3545 }, { "entropy": 1.5845131874084473, "epoch": 1.1685319289005924, "grad_norm": 1.3245034217834473, "learning_rate": 8.058439570370896e-06, "loss": 0.0269, "mean_token_accuracy": 0.9917524337768555, "num_tokens": 7568110.0, "step": 3550 }, { "entropy": 1.576395034790039, "epoch": 1.1701777485187623, "grad_norm": 1.6202564239501953, "learning_rate": 8.031750585322948e-06, "loss": 0.0353, "mean_token_accuracy": 0.9901875615119934, "num_tokens": 7578544.0, "step": 3555 }, { "entropy": 1.6079319357872008, "epoch": 1.1718235681369322, "grad_norm": 1.0341594219207764, "learning_rate": 8.005076177235337e-06, "loss": 0.0217, "mean_token_accuracy": 0.9924035966396332, "num_tokens": 7589338.0, "step": 3560 }, { "entropy": 1.5887961030006408, "epoch": 1.1734693877551021, "grad_norm": 2.502521276473999, "learning_rate": 7.978416543660157e-06, "loss": 0.042, "mean_token_accuracy": 0.9888204395771026, "num_tokens": 7599872.0, "step": 3565 }, { "entropy": 1.5681854605674743, "epoch": 1.1751152073732718, "grad_norm": 0.6041249632835388, "learning_rate": 7.951771882040082e-06, "loss": 0.0141, "mean_token_accuracy": 0.9941197037696838, "num_tokens": 7610467.0, "step": 3570 }, { "entropy": 1.5940211772918702, "epoch": 1.1767610269914417, "grad_norm": 2.8404202461242676, "learning_rate": 7.92514238970689e-06, "loss": 0.0284, "mean_token_accuracy": 0.9908346593379974, "num_tokens": 7621740.0, "step": 3575 }, { "entropy": 1.6273322224617004, "epoch": 1.1784068466096116, "grad_norm": 2.6306891441345215, "learning_rate": 7.898528263880032e-06, "loss": 0.0421, "mean_token_accuracy": 0.9906798958778381, "num_tokens": 7632253.0, "step": 3580 }, { "entropy": 1.6058209180831908, "epoch": 1.1800526662277815, "grad_norm": 2.210401773452759, "learning_rate": 7.871929701665147e-06, "loss": 0.02, "mean_token_accuracy": 0.9937019526958466, "num_tokens": 7642945.0, "step": 3585 }, { "entropy": 1.6328086972236633, "epoch": 1.1816984858459514, "grad_norm": 1.6380540132522583, "learning_rate": 7.8453469000526e-06, "loss": 0.0288, "mean_token_accuracy": 0.9929502904415131, "num_tokens": 7653553.0, "step": 3590 }, { "entropy": 1.6912413835525513, "epoch": 1.183344305464121, "grad_norm": 4.160943508148193, "learning_rate": 7.818780055916052e-06, "loss": 0.0195, "mean_token_accuracy": 0.9919560194015503, "num_tokens": 7663891.0, "step": 3595 }, { "entropy": 1.6919382691383362, "epoch": 1.184990125082291, "grad_norm": 0.695886492729187, "learning_rate": 7.792229366010959e-06, "loss": 0.0248, "mean_token_accuracy": 0.9903152227401734, "num_tokens": 7674488.0, "step": 3600 }, { "entropy": 1.6337382435798644, "epoch": 1.1866359447004609, "grad_norm": 1.5294524431228638, "learning_rate": 7.765695026973155e-06, "loss": 0.0196, "mean_token_accuracy": 0.9943537652492523, "num_tokens": 7685143.0, "step": 3605 }, { "entropy": 1.6397574901580811, "epoch": 1.1882817643186308, "grad_norm": 2.282261610031128, "learning_rate": 7.73917723531737e-06, "loss": 0.0403, "mean_token_accuracy": 0.9924940526485443, "num_tokens": 7695569.0, "step": 3610 }, { "entropy": 1.5973503470420838, "epoch": 1.1899275839368006, "grad_norm": 5.261276721954346, "learning_rate": 7.71267618743579e-06, "loss": 0.0227, "mean_token_accuracy": 0.9901642620563507, "num_tokens": 7706303.0, "step": 3615 }, { "entropy": 1.6234167337417602, "epoch": 1.1915734035549703, "grad_norm": 2.048280954360962, "learning_rate": 7.686192079596586e-06, "loss": 0.0197, "mean_token_accuracy": 0.9935329794883728, "num_tokens": 7716665.0, "step": 3620 }, { "entropy": 1.6198643565177917, "epoch": 1.1932192231731402, "grad_norm": 1.0738399028778076, "learning_rate": 7.659725107942484e-06, "loss": 0.0267, "mean_token_accuracy": 0.9920990586280822, "num_tokens": 7727458.0, "step": 3625 }, { "entropy": 1.658357810974121, "epoch": 1.1948650427913101, "grad_norm": 1.3478312492370605, "learning_rate": 7.633275468489278e-06, "loss": 0.0254, "mean_token_accuracy": 0.9936649382114411, "num_tokens": 7738325.0, "step": 3630 }, { "entropy": 1.7477381587028504, "epoch": 1.19651086240948, "grad_norm": 2.4919769763946533, "learning_rate": 7.606843357124426e-06, "loss": 0.0399, "mean_token_accuracy": 0.9883269786834716, "num_tokens": 7748772.0, "step": 3635 }, { "entropy": 1.7540024757385253, "epoch": 1.1981566820276497, "grad_norm": 1.3142441511154175, "learning_rate": 7.5804289696055445e-06, "loss": 0.0355, "mean_token_accuracy": 0.9913724660873413, "num_tokens": 7759350.0, "step": 3640 }, { "entropy": 1.7711892247200012, "epoch": 1.1998025016458196, "grad_norm": 2.53527569770813, "learning_rate": 7.554032501559002e-06, "loss": 0.0313, "mean_token_accuracy": 0.9935957908630371, "num_tokens": 7770189.0, "step": 3645 }, { "epoch": 1.2007899934167214, "eval_entropy": 1.8328242014223972, "eval_loss": 0.055253952741622925, "eval_mean_token_accuracy": 0.9852129074527483, "eval_num_tokens": 7776505.0, "eval_runtime": 197.2997, "eval_samples_per_second": 42.205, "eval_steps_per_second": 7.035, "step": 3648 }, { "entropy": 1.820277488231659, "epoch": 1.2014483212639895, "grad_norm": 1.2625054121017456, "learning_rate": 7.52765414847846e-06, "loss": 0.0367, "mean_token_accuracy": 0.9870195090770721, "num_tokens": 7780633.0, "step": 3650 }, { "entropy": 1.850312602519989, "epoch": 1.2030941408821594, "grad_norm": 2.3248748779296875, "learning_rate": 7.5012941057234015e-06, "loss": 0.0348, "mean_token_accuracy": 0.988330626487732, "num_tokens": 7791063.0, "step": 3655 }, { "entropy": 1.7486730933189392, "epoch": 1.204739960500329, "grad_norm": 2.7954654693603516, "learning_rate": 7.4749525685177145e-06, "loss": 0.0345, "mean_token_accuracy": 0.9921814143657685, "num_tokens": 7801794.0, "step": 3660 }, { "entropy": 1.7875786900520325, "epoch": 1.206385780118499, "grad_norm": 1.7330853939056396, "learning_rate": 7.44862973194823e-06, "loss": 0.0377, "mean_token_accuracy": 0.9861465573310852, "num_tokens": 7812680.0, "step": 3665 }, { "entropy": 1.7702386498451232, "epoch": 1.2080315997366688, "grad_norm": 1.6117557287216187, "learning_rate": 7.422325790963286e-06, "loss": 0.0228, "mean_token_accuracy": 0.9921964347362519, "num_tokens": 7823265.0, "step": 3670 }, { "entropy": 1.750121569633484, "epoch": 1.2096774193548387, "grad_norm": 1.6110137701034546, "learning_rate": 7.39604094037127e-06, "loss": 0.0226, "mean_token_accuracy": 0.9932965815067292, "num_tokens": 7833616.0, "step": 3675 }, { "entropy": 1.7086394667625426, "epoch": 1.2113232389730086, "grad_norm": 5.368286609649658, "learning_rate": 7.369775374839196e-06, "loss": 0.0257, "mean_token_accuracy": 0.9906518340110779, "num_tokens": 7844464.0, "step": 3680 }, { "entropy": 1.7639584064483642, "epoch": 1.2129690585911783, "grad_norm": 0.626293420791626, "learning_rate": 7.343529288891239e-06, "loss": 0.0251, "mean_token_accuracy": 0.9907599687576294, "num_tokens": 7855198.0, "step": 3685 }, { "entropy": 1.7602010250091553, "epoch": 1.2146148782093482, "grad_norm": 0.723297119140625, "learning_rate": 7.317302876907318e-06, "loss": 0.0312, "mean_token_accuracy": 0.9907152950763702, "num_tokens": 7865513.0, "step": 3690 }, { "entropy": 1.780686092376709, "epoch": 1.2162606978275181, "grad_norm": 0.540903627872467, "learning_rate": 7.29109633312164e-06, "loss": 0.0224, "mean_token_accuracy": 0.9916972815990448, "num_tokens": 7875954.0, "step": 3695 }, { "entropy": 1.7540069580078126, "epoch": 1.217906517445688, "grad_norm": 1.341984510421753, "learning_rate": 7.2649098516212715e-06, "loss": 0.0382, "mean_token_accuracy": 0.9883949160575867, "num_tokens": 7886845.0, "step": 3700 }, { "entropy": 1.759465503692627, "epoch": 1.219552337063858, "grad_norm": 1.8915423154830933, "learning_rate": 7.2387436263446885e-06, "loss": 0.0222, "mean_token_accuracy": 0.9931784868240356, "num_tokens": 7897311.0, "step": 3705 }, { "entropy": 1.768437671661377, "epoch": 1.2211981566820276, "grad_norm": 1.4099783897399902, "learning_rate": 7.2125978510803565e-06, "loss": 0.0268, "mean_token_accuracy": 0.9915161907672883, "num_tokens": 7907999.0, "step": 3710 }, { "entropy": 1.7534563660621643, "epoch": 1.2228439763001975, "grad_norm": 1.8560601472854614, "learning_rate": 7.1864727194652874e-06, "loss": 0.0295, "mean_token_accuracy": 0.9926966905593873, "num_tokens": 7918701.0, "step": 3715 }, { "entropy": 1.7628652215003968, "epoch": 1.2244897959183674, "grad_norm": 1.2373379468917847, "learning_rate": 7.1603684249835966e-06, "loss": 0.0226, "mean_token_accuracy": 0.9931581020355225, "num_tokens": 7929045.0, "step": 3720 }, { "entropy": 1.7709421277046205, "epoch": 1.2261356155365373, "grad_norm": 1.8868582248687744, "learning_rate": 7.134285160965091e-06, "loss": 0.0213, "mean_token_accuracy": 0.9951976418495179, "num_tokens": 7939549.0, "step": 3725 }, { "entropy": 1.7510177731513976, "epoch": 1.2277814351547072, "grad_norm": 1.6280816793441772, "learning_rate": 7.108223120583806e-06, "loss": 0.0431, "mean_token_accuracy": 0.9894875645637512, "num_tokens": 7950189.0, "step": 3730 }, { "entropy": 1.7418721914291382, "epoch": 1.2294272547728768, "grad_norm": 2.297192335128784, "learning_rate": 7.0821824968566186e-06, "loss": 0.0192, "mean_token_accuracy": 0.9930408895015717, "num_tokens": 7960790.0, "step": 3735 }, { "entropy": 1.7448366165161133, "epoch": 1.2310730743910467, "grad_norm": 2.935821294784546, "learning_rate": 7.056163482641769e-06, "loss": 0.027, "mean_token_accuracy": 0.9896710455417633, "num_tokens": 7971434.0, "step": 3740 }, { "entropy": 1.6850064516067504, "epoch": 1.2327188940092166, "grad_norm": 1.6496083736419678, "learning_rate": 7.030166270637475e-06, "loss": 0.0238, "mean_token_accuracy": 0.9923489093780518, "num_tokens": 7981902.0, "step": 3745 }, { "entropy": 1.6959288597106934, "epoch": 1.2343647136273865, "grad_norm": 4.362974643707275, "learning_rate": 7.004191053380469e-06, "loss": 0.028, "mean_token_accuracy": 0.9899141073226929, "num_tokens": 7992608.0, "step": 3750 }, { "entropy": 1.7090837240219117, "epoch": 1.2360105332455562, "grad_norm": 1.9156920909881592, "learning_rate": 6.978238023244608e-06, "loss": 0.0274, "mean_token_accuracy": 0.9905971348285675, "num_tokens": 8003032.0, "step": 3755 }, { "entropy": 1.6994237184524537, "epoch": 1.237656352863726, "grad_norm": 1.834991455078125, "learning_rate": 6.952307372439411e-06, "loss": 0.0264, "mean_token_accuracy": 0.9915358006954194, "num_tokens": 8013591.0, "step": 3760 }, { "entropy": 1.6717028617858887, "epoch": 1.239302172481896, "grad_norm": 1.8597207069396973, "learning_rate": 6.926399293008668e-06, "loss": 0.0233, "mean_token_accuracy": 0.9935126841068268, "num_tokens": 8024009.0, "step": 3765 }, { "entropy": 1.6924420833587646, "epoch": 1.240947992100066, "grad_norm": 2.2939343452453613, "learning_rate": 6.900513976829e-06, "loss": 0.0323, "mean_token_accuracy": 0.9890012562274932, "num_tokens": 8034690.0, "step": 3770 }, { "entropy": 1.746755886077881, "epoch": 1.2425938117182356, "grad_norm": 2.126143455505371, "learning_rate": 6.874651615608441e-06, "loss": 0.0174, "mean_token_accuracy": 0.9934778928756713, "num_tokens": 8045114.0, "step": 3775 }, { "entropy": 1.6977684855461121, "epoch": 1.2442396313364055, "grad_norm": 0.6126706600189209, "learning_rate": 6.848812400885022e-06, "loss": 0.0192, "mean_token_accuracy": 0.9939151763916015, "num_tokens": 8055532.0, "step": 3780 }, { "entropy": 1.6669098496437074, "epoch": 1.2458854509545754, "grad_norm": 0.8724939227104187, "learning_rate": 6.822996524025343e-06, "loss": 0.0197, "mean_token_accuracy": 0.9912198483943939, "num_tokens": 8066233.0, "step": 3785 }, { "entropy": 1.6577038049697876, "epoch": 1.2475312705727453, "grad_norm": 0.9917463064193726, "learning_rate": 6.7972041762231735e-06, "loss": 0.025, "mean_token_accuracy": 0.990100759267807, "num_tokens": 8076870.0, "step": 3790 }, { "entropy": 1.6371492862701416, "epoch": 1.2491770901909152, "grad_norm": 3.3986692428588867, "learning_rate": 6.771435548498013e-06, "loss": 0.035, "mean_token_accuracy": 0.9852859020233155, "num_tokens": 8087788.0, "step": 3795 }, { "entropy": 1.6349127650260926, "epoch": 1.2508229098090848, "grad_norm": 0.3032724857330322, "learning_rate": 6.745690831693701e-06, "loss": 0.0251, "mean_token_accuracy": 0.993361896276474, "num_tokens": 8098543.0, "step": 3800 }, { "entropy": 1.6615733742713927, "epoch": 1.2524687294272547, "grad_norm": 1.261225700378418, "learning_rate": 6.719970216476982e-06, "loss": 0.0253, "mean_token_accuracy": 0.9910578489303589, "num_tokens": 8109340.0, "step": 3805 }, { "entropy": 1.680140745639801, "epoch": 1.2541145490454246, "grad_norm": 1.3544028997421265, "learning_rate": 6.694273893336112e-06, "loss": 0.0229, "mean_token_accuracy": 0.9918768882751465, "num_tokens": 8119963.0, "step": 3810 }, { "entropy": 1.6792070388793945, "epoch": 1.2557603686635945, "grad_norm": 1.9016791582107544, "learning_rate": 6.668602052579425e-06, "loss": 0.0263, "mean_token_accuracy": 0.9923485815525055, "num_tokens": 8130505.0, "step": 3815 }, { "entropy": 1.7175181269645692, "epoch": 1.2574061882817644, "grad_norm": 0.3682861924171448, "learning_rate": 6.6429548843339554e-06, "loss": 0.0225, "mean_token_accuracy": 0.991203784942627, "num_tokens": 8141063.0, "step": 3820 }, { "entropy": 1.6892056941986084, "epoch": 1.259052007899934, "grad_norm": 5.687158107757568, "learning_rate": 6.617332578543991e-06, "loss": 0.0289, "mean_token_accuracy": 0.9894017159938813, "num_tokens": 8152016.0, "step": 3825 }, { "entropy": 1.7227007865905761, "epoch": 1.260697827518104, "grad_norm": 2.130305290222168, "learning_rate": 6.591735324969703e-06, "loss": 0.0423, "mean_token_accuracy": 0.9899180591106415, "num_tokens": 8162786.0, "step": 3830 }, { "entropy": 1.766884195804596, "epoch": 1.262343647136274, "grad_norm": 2.274033546447754, "learning_rate": 6.566163313185725e-06, "loss": 0.0261, "mean_token_accuracy": 0.9887346982955932, "num_tokens": 8173362.0, "step": 3835 }, { "entropy": 1.8262192487716675, "epoch": 1.2639894667544438, "grad_norm": 0.5273262858390808, "learning_rate": 6.540616732579732e-06, "loss": 0.0387, "mean_token_accuracy": 0.9902793288230896, "num_tokens": 8183767.0, "step": 3840 }, { "entropy": 1.8414955258369445, "epoch": 1.2656352863726137, "grad_norm": 0.5974478721618652, "learning_rate": 6.515095772351072e-06, "loss": 0.0124, "mean_token_accuracy": 0.99626624584198, "num_tokens": 8194098.0, "step": 3845 }, { "entropy": 1.8668495297431946, "epoch": 1.2672811059907834, "grad_norm": 1.7619863748550415, "learning_rate": 6.489600621509338e-06, "loss": 0.0206, "mean_token_accuracy": 0.9931797504425048, "num_tokens": 8204769.0, "step": 3850 }, { "entropy": 1.8715619444847107, "epoch": 1.2689269256089533, "grad_norm": 2.3929049968719482, "learning_rate": 6.464131468872978e-06, "loss": 0.0168, "mean_token_accuracy": 0.9927472591400146, "num_tokens": 8215368.0, "step": 3855 }, { "entropy": 1.8072044491767882, "epoch": 1.2705727452271232, "grad_norm": 2.2075207233428955, "learning_rate": 6.4386885030679e-06, "loss": 0.0332, "mean_token_accuracy": 0.992115718126297, "num_tokens": 8226508.0, "step": 3860 }, { "entropy": 1.905867862701416, "epoch": 1.2722185648452928, "grad_norm": 1.7269535064697266, "learning_rate": 6.413271912526071e-06, "loss": 0.0218, "mean_token_accuracy": 0.993303781747818, "num_tokens": 8237461.0, "step": 3865 }, { "entropy": 1.8554383277893067, "epoch": 1.2738643844634627, "grad_norm": 1.8244965076446533, "learning_rate": 6.3878818854841095e-06, "loss": 0.0202, "mean_token_accuracy": 0.9931906521320343, "num_tokens": 8248162.0, "step": 3870 }, { "entropy": 1.903454637527466, "epoch": 1.2755102040816326, "grad_norm": 1.2271250486373901, "learning_rate": 6.36251860998192e-06, "loss": 0.0204, "mean_token_accuracy": 0.9948872745037078, "num_tokens": 8258852.0, "step": 3875 }, { "entropy": 1.8506133675575256, "epoch": 1.2771560236998025, "grad_norm": 1.973997712135315, "learning_rate": 6.337182273861273e-06, "loss": 0.03, "mean_token_accuracy": 0.9918236672878266, "num_tokens": 8269756.0, "step": 3880 }, { "entropy": 1.8246717214584351, "epoch": 1.2788018433179724, "grad_norm": 1.5060253143310547, "learning_rate": 6.311873064764429e-06, "loss": 0.0203, "mean_token_accuracy": 0.992903220653534, "num_tokens": 8280446.0, "step": 3885 }, { "entropy": 1.7956886053085328, "epoch": 1.280447662936142, "grad_norm": 2.8490633964538574, "learning_rate": 6.2865911701327445e-06, "loss": 0.0434, "mean_token_accuracy": 0.9881740510463715, "num_tokens": 8291519.0, "step": 3890 }, { "entropy": 1.7901652932167054, "epoch": 1.282093482554312, "grad_norm": 2.23622727394104, "learning_rate": 6.261336777205278e-06, "loss": 0.0304, "mean_token_accuracy": 0.9898859024047851, "num_tokens": 8302444.0, "step": 3895 }, { "entropy": 1.795331597328186, "epoch": 1.2837393021724819, "grad_norm": 2.6051158905029297, "learning_rate": 6.236110073017417e-06, "loss": 0.0215, "mean_token_accuracy": 0.9921227872371674, "num_tokens": 8313239.0, "step": 3900 }, { "entropy": 1.8181136965751648, "epoch": 1.2853851217906518, "grad_norm": 1.3954198360443115, "learning_rate": 6.210911244399477e-06, "loss": 0.0241, "mean_token_accuracy": 0.9936698138713836, "num_tokens": 8323898.0, "step": 3905 }, { "entropy": 1.801988172531128, "epoch": 1.2870309414088217, "grad_norm": 1.1723895072937012, "learning_rate": 6.185740477975335e-06, "loss": 0.0183, "mean_token_accuracy": 0.9923419177532196, "num_tokens": 8334473.0, "step": 3910 }, { "entropy": 1.783213496208191, "epoch": 1.2886767610269914, "grad_norm": 4.490993499755859, "learning_rate": 6.160597960161024e-06, "loss": 0.0452, "mean_token_accuracy": 0.9866098284721374, "num_tokens": 8345014.0, "step": 3915 }, { "entropy": 1.73745299577713, "epoch": 1.2903225806451613, "grad_norm": 2.8388960361480713, "learning_rate": 6.135483877163383e-06, "loss": 0.0243, "mean_token_accuracy": 0.9913448512554168, "num_tokens": 8355724.0, "step": 3920 }, { "entropy": 1.7496454119682312, "epoch": 1.2919684002633312, "grad_norm": 1.7147318124771118, "learning_rate": 6.1103984149786444e-06, "loss": 0.0262, "mean_token_accuracy": 0.9924720883369446, "num_tokens": 8366189.0, "step": 3925 }, { "entropy": 1.7078065276145935, "epoch": 1.293614219881501, "grad_norm": 2.615743398666382, "learning_rate": 6.085341759391089e-06, "loss": 0.0138, "mean_token_accuracy": 0.9948653101921081, "num_tokens": 8376886.0, "step": 3930 }, { "entropy": 1.7135319232940673, "epoch": 1.295260039499671, "grad_norm": 0.48522359132766724, "learning_rate": 6.060314095971641e-06, "loss": 0.0258, "mean_token_accuracy": 0.9912464320659637, "num_tokens": 8387635.0, "step": 3935 }, { "entropy": 1.7192439675331115, "epoch": 1.2969058591178406, "grad_norm": 0.810149073600769, "learning_rate": 6.035315610076518e-06, "loss": 0.0246, "mean_token_accuracy": 0.9908073008060455, "num_tokens": 8398117.0, "step": 3940 }, { "entropy": 1.6878206253051757, "epoch": 1.2985516787360105, "grad_norm": 1.6084260940551758, "learning_rate": 6.010346486845837e-06, "loss": 0.0238, "mean_token_accuracy": 0.9926587820053101, "num_tokens": 8408773.0, "step": 3945 }, { "entropy": 1.682581651210785, "epoch": 1.3001974983541804, "grad_norm": 2.0567915439605713, "learning_rate": 5.985406911202263e-06, "loss": 0.0169, "mean_token_accuracy": 0.9948483347892761, "num_tokens": 8419245.0, "step": 3950 }, { "entropy": 1.6870299458503724, "epoch": 1.3018433179723503, "grad_norm": 2.500347375869751, "learning_rate": 5.960497067849627e-06, "loss": 0.0298, "mean_token_accuracy": 0.9893729627132416, "num_tokens": 8430019.0, "step": 3955 }, { "entropy": 1.7270505785942079, "epoch": 1.3034891375905202, "grad_norm": 2.2009787559509277, "learning_rate": 5.935617141271554e-06, "loss": 0.0255, "mean_token_accuracy": 0.9894290864467621, "num_tokens": 8440514.0, "step": 3960 }, { "entropy": 1.7091917991638184, "epoch": 1.3051349572086899, "grad_norm": 3.052794933319092, "learning_rate": 5.910767315730119e-06, "loss": 0.0366, "mean_token_accuracy": 0.9881552219390869, "num_tokens": 8451099.0, "step": 3965 }, { "entropy": 1.7440539956092835, "epoch": 1.3067807768268598, "grad_norm": 1.05941641330719, "learning_rate": 5.885947775264447e-06, "loss": 0.0241, "mean_token_accuracy": 0.9918931663036347, "num_tokens": 8461462.0, "step": 3970 }, { "entropy": 1.751047670841217, "epoch": 1.3084265964450297, "grad_norm": 1.5784897804260254, "learning_rate": 5.861158703689389e-06, "loss": 0.0216, "mean_token_accuracy": 0.9915867269039154, "num_tokens": 8472292.0, "step": 3975 }, { "entropy": 1.7028122067451477, "epoch": 1.3100724160631994, "grad_norm": 1.145462155342102, "learning_rate": 5.836400284594126e-06, "loss": 0.0247, "mean_token_accuracy": 0.9926209390163422, "num_tokens": 8482780.0, "step": 3980 }, { "entropy": 1.7101820945739745, "epoch": 1.3117182356813692, "grad_norm": 1.7238191366195679, "learning_rate": 5.811672701340847e-06, "loss": 0.0314, "mean_token_accuracy": 0.9896239399909973, "num_tokens": 8493301.0, "step": 3985 }, { "entropy": 1.7027180314064025, "epoch": 1.3133640552995391, "grad_norm": 1.4847729206085205, "learning_rate": 5.786976137063336e-06, "loss": 0.0209, "mean_token_accuracy": 0.9931684076786041, "num_tokens": 8504099.0, "step": 3990 }, { "entropy": 1.7060617804527283, "epoch": 1.315009874917709, "grad_norm": 0.6948561072349548, "learning_rate": 5.762310774665682e-06, "loss": 0.0404, "mean_token_accuracy": 0.9896627187728881, "num_tokens": 8514523.0, "step": 3995 }, { "entropy": 1.7185169339179993, "epoch": 1.316655694535879, "grad_norm": 1.8867969512939453, "learning_rate": 5.737676796820871e-06, "loss": 0.018, "mean_token_accuracy": 0.9954242944717407, "num_tokens": 8525127.0, "step": 4000 }, { "entropy": 1.6883883237838746, "epoch": 1.3183015141540486, "grad_norm": 2.294797420501709, "learning_rate": 5.713074385969457e-06, "loss": 0.018, "mean_token_accuracy": 0.9943160593509675, "num_tokens": 8535541.0, "step": 4005 }, { "entropy": 1.6336073637008668, "epoch": 1.3199473337722185, "grad_norm": 0.9374617338180542, "learning_rate": 5.688503724318217e-06, "loss": 0.0102, "mean_token_accuracy": 0.9958161950111389, "num_tokens": 8546220.0, "step": 4010 }, { "entropy": 1.6534652829170227, "epoch": 1.3215931533903884, "grad_norm": 1.0187243223190308, "learning_rate": 5.663964993838779e-06, "loss": 0.0204, "mean_token_accuracy": 0.9947121858596801, "num_tokens": 8556714.0, "step": 4015 }, { "entropy": 1.6586306810379028, "epoch": 1.3232389730085583, "grad_norm": 2.5042691230773926, "learning_rate": 5.639458376266295e-06, "loss": 0.0344, "mean_token_accuracy": 0.9897667288780212, "num_tokens": 8567099.0, "step": 4020 }, { "entropy": 1.6488097548484801, "epoch": 1.3248847926267282, "grad_norm": 0.3930192291736603, "learning_rate": 5.614984053098076e-06, "loss": 0.029, "mean_token_accuracy": 0.9916586577892303, "num_tokens": 8577866.0, "step": 4025 }, { "entropy": 1.6415641903877258, "epoch": 1.3265306122448979, "grad_norm": 1.2496612071990967, "learning_rate": 5.590542205592283e-06, "loss": 0.0288, "mean_token_accuracy": 0.9903595089912415, "num_tokens": 8588800.0, "step": 4030 }, { "entropy": 1.6364824771881104, "epoch": 1.3281764318630678, "grad_norm": 2.232058048248291, "learning_rate": 5.566133014766526e-06, "loss": 0.0333, "mean_token_accuracy": 0.9903694272041321, "num_tokens": 8599559.0, "step": 4035 }, { "entropy": 1.699565863609314, "epoch": 1.3298222514812377, "grad_norm": 2.588693618774414, "learning_rate": 5.541756661396591e-06, "loss": 0.0229, "mean_token_accuracy": 0.9930707275867462, "num_tokens": 8610049.0, "step": 4040 }, { "entropy": 1.6720836758613586, "epoch": 1.3314680710994076, "grad_norm": 1.1052114963531494, "learning_rate": 5.517413326015046e-06, "loss": 0.0325, "mean_token_accuracy": 0.9897372424602509, "num_tokens": 8620905.0, "step": 4045 }, { "entropy": 1.7122917652130127, "epoch": 1.3331138907175775, "grad_norm": 1.7371225357055664, "learning_rate": 5.493103188909939e-06, "loss": 0.0256, "mean_token_accuracy": 0.9902097463607789, "num_tokens": 8631605.0, "step": 4050 }, { "entropy": 1.7047467947006225, "epoch": 1.3347597103357471, "grad_norm": 3.5729825496673584, "learning_rate": 5.468826430123435e-06, "loss": 0.021, "mean_token_accuracy": 0.992638248205185, "num_tokens": 8642346.0, "step": 4055 }, { "entropy": 1.7031975507736206, "epoch": 1.336405529953917, "grad_norm": 2.91983962059021, "learning_rate": 5.444583229450518e-06, "loss": 0.0317, "mean_token_accuracy": 0.9902465045452118, "num_tokens": 8653100.0, "step": 4060 }, { "entropy": 1.697403860092163, "epoch": 1.338051349572087, "grad_norm": 1.3388501405715942, "learning_rate": 5.4203737664376235e-06, "loss": 0.0412, "mean_token_accuracy": 0.9881835162639618, "num_tokens": 8663816.0, "step": 4065 }, { "entropy": 1.7462014317512513, "epoch": 1.3396971691902566, "grad_norm": 5.225465774536133, "learning_rate": 5.3961982203813215e-06, "loss": 0.0383, "mean_token_accuracy": 0.9885516107082367, "num_tokens": 8674657.0, "step": 4070 }, { "entropy": 1.7663294553756714, "epoch": 1.3413429888084267, "grad_norm": 1.6548188924789429, "learning_rate": 5.3720567703270135e-06, "loss": 0.0239, "mean_token_accuracy": 0.99189293384552, "num_tokens": 8685457.0, "step": 4075 }, { "entropy": 1.7495159268379212, "epoch": 1.3429888084265964, "grad_norm": 1.063901662826538, "learning_rate": 5.34794959506755e-06, "loss": 0.0237, "mean_token_accuracy": 0.9917633235454559, "num_tokens": 8695979.0, "step": 4080 }, { "entropy": 1.7705445170402527, "epoch": 1.3446346280447663, "grad_norm": 0.8427658677101135, "learning_rate": 5.323876873141973e-06, "loss": 0.0224, "mean_token_accuracy": 0.9938309490680695, "num_tokens": 8706574.0, "step": 4085 }, { "entropy": 1.7015425443649292, "epoch": 1.3462804476629362, "grad_norm": 4.552727222442627, "learning_rate": 5.299838782834141e-06, "loss": 0.0273, "mean_token_accuracy": 0.9894029378890992, "num_tokens": 8717255.0, "step": 4090 }, { "entropy": 1.7069133877754212, "epoch": 1.3479262672811059, "grad_norm": 4.266534805297852, "learning_rate": 5.275835502171439e-06, "loss": 0.024, "mean_token_accuracy": 0.992630559206009, "num_tokens": 8728164.0, "step": 4095 }, { "entropy": 1.730119228363037, "epoch": 1.3495720868992758, "grad_norm": 2.2071967124938965, "learning_rate": 5.251867208923439e-06, "loss": 0.0151, "mean_token_accuracy": 0.9952519774436951, "num_tokens": 8738646.0, "step": 4100 }, { "entropy": 1.7257731676101684, "epoch": 1.3512179065174457, "grad_norm": 3.7456114292144775, "learning_rate": 5.227934080600611e-06, "loss": 0.032, "mean_token_accuracy": 0.9912447273731232, "num_tokens": 8749142.0, "step": 4105 }, { "entropy": 1.6916332483291625, "epoch": 1.3528637261356156, "grad_norm": 2.742368459701538, "learning_rate": 5.2040362944529765e-06, "loss": 0.0182, "mean_token_accuracy": 0.9944073140621186, "num_tokens": 8759739.0, "step": 4110 }, { "entropy": 1.6806737899780273, "epoch": 1.3545095457537855, "grad_norm": 1.4229061603546143, "learning_rate": 5.180174027468818e-06, "loss": 0.0184, "mean_token_accuracy": 0.9935629963874817, "num_tokens": 8770377.0, "step": 4115 }, { "entropy": 1.6647310853004456, "epoch": 1.3561553653719551, "grad_norm": 2.2806296348571777, "learning_rate": 5.156347456373359e-06, "loss": 0.0251, "mean_token_accuracy": 0.9908474087715149, "num_tokens": 8780910.0, "step": 4120 }, { "entropy": 1.6626045823097229, "epoch": 1.357801184990125, "grad_norm": 1.6852504014968872, "learning_rate": 5.1325567576274595e-06, "loss": 0.0328, "mean_token_accuracy": 0.9896870791912079, "num_tokens": 8791765.0, "step": 4125 }, { "entropy": 1.7072691798210144, "epoch": 1.359447004608295, "grad_norm": 0.524549126625061, "learning_rate": 5.108802107426307e-06, "loss": 0.0152, "mean_token_accuracy": 0.9940847456455231, "num_tokens": 8802312.0, "step": 4130 }, { "entropy": 1.7237987995147706, "epoch": 1.3610928242264648, "grad_norm": 2.021660566329956, "learning_rate": 5.085083681698108e-06, "loss": 0.0437, "mean_token_accuracy": 0.9889907777309418, "num_tokens": 8813114.0, "step": 4135 }, { "entropy": 1.7635319709777832, "epoch": 1.3627386438446347, "grad_norm": 3.0634493827819824, "learning_rate": 5.061401656102791e-06, "loss": 0.04, "mean_token_accuracy": 0.9863172173500061, "num_tokens": 8823776.0, "step": 4140 }, { "entropy": 1.7534740567207336, "epoch": 1.3643844634628044, "grad_norm": 1.1463490724563599, "learning_rate": 5.0377562060307e-06, "loss": 0.032, "mean_token_accuracy": 0.9871594190597535, "num_tokens": 8834704.0, "step": 4145 }, { "entropy": 1.7889039039611816, "epoch": 1.3660302830809743, "grad_norm": 0.34571316838264465, "learning_rate": 5.014147506601308e-06, "loss": 0.0239, "mean_token_accuracy": 0.9936646819114685, "num_tokens": 8845254.0, "step": 4150 }, { "entropy": 1.8070958495140075, "epoch": 1.3676761026991442, "grad_norm": 1.5474847555160522, "learning_rate": 4.990575732661902e-06, "loss": 0.0287, "mean_token_accuracy": 0.9930408537387848, "num_tokens": 8855930.0, "step": 4155 }, { "entropy": 1.7869633316993714, "epoch": 1.369321922317314, "grad_norm": 1.3527427911758423, "learning_rate": 4.9670410587862995e-06, "loss": 0.0341, "mean_token_accuracy": 0.9886087715625763, "num_tokens": 8866777.0, "step": 4160 }, { "entropy": 1.8497827887535094, "epoch": 1.370967741935484, "grad_norm": 1.8562260866165161, "learning_rate": 4.943543659273548e-06, "loss": 0.0476, "mean_token_accuracy": 0.9888940870761871, "num_tokens": 8877374.0, "step": 4165 }, { "entropy": 1.8215659737586976, "epoch": 1.3726135615536537, "grad_norm": 1.3257911205291748, "learning_rate": 4.920083708146655e-06, "loss": 0.0185, "mean_token_accuracy": 0.9913751244544983, "num_tokens": 8888152.0, "step": 4170 }, { "entropy": 1.8450712442398072, "epoch": 1.3742593811718236, "grad_norm": 0.5730196833610535, "learning_rate": 4.896661379151259e-06, "loss": 0.0186, "mean_token_accuracy": 0.9925144612789154, "num_tokens": 8898690.0, "step": 4175 }, { "entropy": 1.8470216035842895, "epoch": 1.3759052007899935, "grad_norm": 1.4637004137039185, "learning_rate": 4.873276845754388e-06, "loss": 0.0141, "mean_token_accuracy": 0.9950567662715912, "num_tokens": 8909192.0, "step": 4180 }, { "entropy": 1.7927292108535766, "epoch": 1.3775510204081631, "grad_norm": 2.084494113922119, "learning_rate": 4.849930281143146e-06, "loss": 0.0424, "mean_token_accuracy": 0.9881355404853821, "num_tokens": 8920016.0, "step": 4185 }, { "entropy": 1.815053677558899, "epoch": 1.3791968400263332, "grad_norm": 1.7285300493240356, "learning_rate": 4.826621858223431e-06, "loss": 0.0247, "mean_token_accuracy": 0.992021232843399, "num_tokens": 8930392.0, "step": 4190 }, { "entropy": 1.80452299118042, "epoch": 1.380842659644503, "grad_norm": 4.97409725189209, "learning_rate": 4.803351749618679e-06, "loss": 0.0234, "mean_token_accuracy": 0.9906729996204376, "num_tokens": 8940931.0, "step": 4195 }, { "entropy": 1.7994644165039062, "epoch": 1.3824884792626728, "grad_norm": 1.7542165517807007, "learning_rate": 4.780120127668553e-06, "loss": 0.019, "mean_token_accuracy": 0.9934443414211274, "num_tokens": 8951603.0, "step": 4200 }, { "entropy": 1.800536823272705, "epoch": 1.3841342988808427, "grad_norm": 1.1492308378219604, "learning_rate": 4.756927164427685e-06, "loss": 0.0318, "mean_token_accuracy": 0.9890105724334717, "num_tokens": 8962110.0, "step": 4205 }, { "entropy": 1.7841524839401246, "epoch": 1.3857801184990124, "grad_norm": 2.564023971557617, "learning_rate": 4.733773031664398e-06, "loss": 0.0292, "mean_token_accuracy": 0.9909674048423767, "num_tokens": 8972933.0, "step": 4210 }, { "entropy": 1.8291552066802979, "epoch": 1.3874259381171823, "grad_norm": 0.70340496301651, "learning_rate": 4.710657900859447e-06, "loss": 0.0205, "mean_token_accuracy": 0.9959937572479248, "num_tokens": 8983508.0, "step": 4215 }, { "entropy": 1.8385101079940795, "epoch": 1.3890717577353522, "grad_norm": 1.934988021850586, "learning_rate": 4.687581943204711e-06, "loss": 0.031, "mean_token_accuracy": 0.9913569748401642, "num_tokens": 8994226.0, "step": 4220 }, { "entropy": 1.8304347276687623, "epoch": 1.390717577353522, "grad_norm": 1.6579409837722778, "learning_rate": 4.664545329601977e-06, "loss": 0.0269, "mean_token_accuracy": 0.9913767516613007, "num_tokens": 9004843.0, "step": 4225 }, { "entropy": 1.842625379562378, "epoch": 1.392363396971692, "grad_norm": 1.2633566856384277, "learning_rate": 4.641548230661633e-06, "loss": 0.0327, "mean_token_accuracy": 0.990666514635086, "num_tokens": 9015138.0, "step": 4230 }, { "entropy": 1.822058653831482, "epoch": 1.3940092165898617, "grad_norm": 2.8293750286102295, "learning_rate": 4.618590816701422e-06, "loss": 0.0265, "mean_token_accuracy": 0.9897272527217865, "num_tokens": 9025856.0, "step": 4235 }, { "entropy": 1.8393550515174866, "epoch": 1.3956550362080316, "grad_norm": 0.7082259654998779, "learning_rate": 4.5956732577451745e-06, "loss": 0.0155, "mean_token_accuracy": 0.9960557222366333, "num_tokens": 9036445.0, "step": 4240 }, { "entropy": 1.8048294067382813, "epoch": 1.3973008558262014, "grad_norm": 0.9008603096008301, "learning_rate": 4.572795723521565e-06, "loss": 0.0173, "mean_token_accuracy": 0.9946598470211029, "num_tokens": 9047123.0, "step": 4245 }, { "entropy": 1.8322034358978272, "epoch": 1.3989466754443713, "grad_norm": 0.7377194166183472, "learning_rate": 4.549958383462829e-06, "loss": 0.0232, "mean_token_accuracy": 0.9925904154777527, "num_tokens": 9057591.0, "step": 4250 }, { "entropy": 1.8085259079933167, "epoch": 1.4005924950625412, "grad_norm": 0.7783912420272827, "learning_rate": 4.527161406703525e-06, "loss": 0.0381, "mean_token_accuracy": 0.9926593840122223, "num_tokens": 9068234.0, "step": 4255 }, { "epoch": 1.400921658986175, "eval_entropy": 1.8214755838130325, "eval_loss": 0.05032181739807129, "eval_mean_token_accuracy": 0.9866692206959216, "eval_num_tokens": 9070425.0, "eval_runtime": 195.1937, "eval_samples_per_second": 42.66, "eval_steps_per_second": 7.111, "step": 4256 }, { "entropy": 1.8127322912216186, "epoch": 1.402238314680711, "grad_norm": 2.2625997066497803, "learning_rate": 4.504404962079293e-06, "loss": 0.0238, "mean_token_accuracy": 0.9912942230701447, "num_tokens": 9079026.0, "step": 4260 }, { "entropy": 1.825773298740387, "epoch": 1.4038841342988808, "grad_norm": 1.3523027896881104, "learning_rate": 4.481689218125561e-06, "loss": 0.0213, "mean_token_accuracy": 0.9924202978610992, "num_tokens": 9089666.0, "step": 4265 }, { "entropy": 1.790573537349701, "epoch": 1.4055299539170507, "grad_norm": 2.0441343784332275, "learning_rate": 4.459014343076356e-06, "loss": 0.0412, "mean_token_accuracy": 0.9893443644046783, "num_tokens": 9100438.0, "step": 4270 }, { "entropy": 1.8189255356788636, "epoch": 1.4071757735352206, "grad_norm": 1.1708464622497559, "learning_rate": 4.436380504863008e-06, "loss": 0.0169, "mean_token_accuracy": 0.9938211143016815, "num_tokens": 9111194.0, "step": 4275 }, { "entropy": 1.8196102857589722, "epoch": 1.4088215931533905, "grad_norm": 2.159729242324829, "learning_rate": 4.413787871112934e-06, "loss": 0.0197, "mean_token_accuracy": 0.9932897388935089, "num_tokens": 9122132.0, "step": 4280 }, { "entropy": 1.805807101726532, "epoch": 1.4104674127715602, "grad_norm": 2.360366106033325, "learning_rate": 4.391236609148381e-06, "loss": 0.0336, "mean_token_accuracy": 0.9923444032669068, "num_tokens": 9132718.0, "step": 4285 }, { "entropy": 1.813056230545044, "epoch": 1.41211323238973, "grad_norm": 0.6563708782196045, "learning_rate": 4.3687268859852105e-06, "loss": 0.0137, "mean_token_accuracy": 0.9964501917362213, "num_tokens": 9143366.0, "step": 4290 }, { "entropy": 1.8228686213493348, "epoch": 1.4137590520079, "grad_norm": 1.703528642654419, "learning_rate": 4.34625886833163e-06, "loss": 0.019, "mean_token_accuracy": 0.9924368858337402, "num_tokens": 9154131.0, "step": 4295 }, { "entropy": 1.763486886024475, "epoch": 1.4154048716260696, "grad_norm": 2.912816286087036, "learning_rate": 4.323832722586979e-06, "loss": 0.0141, "mean_token_accuracy": 0.9942897677421569, "num_tokens": 9165049.0, "step": 4300 }, { "entropy": 1.8135482668876648, "epoch": 1.4170506912442398, "grad_norm": 1.8574903011322021, "learning_rate": 4.301448614840487e-06, "loss": 0.0423, "mean_token_accuracy": 0.9888120889663696, "num_tokens": 9175582.0, "step": 4305 }, { "entropy": 1.8078377962112426, "epoch": 1.4186965108624094, "grad_norm": 1.1886160373687744, "learning_rate": 4.279106710870059e-06, "loss": 0.0164, "mean_token_accuracy": 0.9940821826457977, "num_tokens": 9186081.0, "step": 4310 }, { "entropy": 1.7859516978263854, "epoch": 1.4203423304805793, "grad_norm": 1.7242977619171143, "learning_rate": 4.256807176141028e-06, "loss": 0.0131, "mean_token_accuracy": 0.99482541680336, "num_tokens": 9196757.0, "step": 4315 }, { "entropy": 1.7669077515602112, "epoch": 1.4219881500987492, "grad_norm": 2.3228442668914795, "learning_rate": 4.2345501758049365e-06, "loss": 0.0278, "mean_token_accuracy": 0.9920706391334534, "num_tokens": 9207097.0, "step": 4320 }, { "entropy": 1.8107630848884582, "epoch": 1.423633969716919, "grad_norm": 2.6625027656555176, "learning_rate": 4.2123358746983225e-06, "loss": 0.041, "mean_token_accuracy": 0.9908157408237457, "num_tokens": 9217832.0, "step": 4325 }, { "entropy": 1.8029837489128113, "epoch": 1.4252797893350888, "grad_norm": 1.867008090019226, "learning_rate": 4.190164437341479e-06, "loss": 0.0204, "mean_token_accuracy": 0.9934187710285187, "num_tokens": 9228324.0, "step": 4330 }, { "entropy": 1.7727699518203734, "epoch": 1.4269256089532587, "grad_norm": 5.103766918182373, "learning_rate": 4.168036027937267e-06, "loss": 0.0227, "mean_token_accuracy": 0.9946336686611176, "num_tokens": 9239199.0, "step": 4335 }, { "entropy": 1.8098699450492859, "epoch": 1.4285714285714286, "grad_norm": 0.7097901105880737, "learning_rate": 4.145950810369863e-06, "loss": 0.0226, "mean_token_accuracy": 0.9928686380386352, "num_tokens": 9249956.0, "step": 4340 }, { "entropy": 1.7923429489135743, "epoch": 1.4302172481895985, "grad_norm": 1.6268037557601929, "learning_rate": 4.1239089482035686e-06, "loss": 0.0244, "mean_token_accuracy": 0.9906105041503906, "num_tokens": 9260596.0, "step": 4345 }, { "entropy": 1.7969419360160828, "epoch": 1.4318630678077682, "grad_norm": 1.7350503206253052, "learning_rate": 4.10191060468159e-06, "loss": 0.0182, "mean_token_accuracy": 0.9944913208484649, "num_tokens": 9271060.0, "step": 4350 }, { "entropy": 1.7492168545722961, "epoch": 1.433508887425938, "grad_norm": 3.6994593143463135, "learning_rate": 4.079955942724845e-06, "loss": 0.0326, "mean_token_accuracy": 0.989514821767807, "num_tokens": 9281476.0, "step": 4355 }, { "entropy": 1.720914614200592, "epoch": 1.435154707044108, "grad_norm": 0.9848082661628723, "learning_rate": 4.0580451249307195e-06, "loss": 0.0215, "mean_token_accuracy": 0.9936264753341675, "num_tokens": 9292136.0, "step": 4360 }, { "entropy": 1.7023021578788757, "epoch": 1.4368005266622779, "grad_norm": 0.8126395344734192, "learning_rate": 4.03617831357191e-06, "loss": 0.0172, "mean_token_accuracy": 0.994192260503769, "num_tokens": 9302979.0, "step": 4365 }, { "entropy": 1.6948873043060302, "epoch": 1.4384463462804478, "grad_norm": 4.326878547668457, "learning_rate": 4.014355670595189e-06, "loss": 0.0211, "mean_token_accuracy": 0.9916200697422027, "num_tokens": 9313652.0, "step": 4370 }, { "entropy": 1.7300003051757813, "epoch": 1.4400921658986174, "grad_norm": 0.8770890235900879, "learning_rate": 3.99257735762021e-06, "loss": 0.0311, "mean_token_accuracy": 0.9879982471466064, "num_tokens": 9324219.0, "step": 4375 }, { "entropy": 1.728563916683197, "epoch": 1.4417379855167873, "grad_norm": 1.0835927724838257, "learning_rate": 3.970843535938332e-06, "loss": 0.0218, "mean_token_accuracy": 0.9914672791957855, "num_tokens": 9334800.0, "step": 4380 }, { "entropy": 1.7277835011482239, "epoch": 1.4433838051349572, "grad_norm": 1.7556816339492798, "learning_rate": 3.949154366511395e-06, "loss": 0.024, "mean_token_accuracy": 0.9926247537136078, "num_tokens": 9345539.0, "step": 4385 }, { "entropy": 1.729397702217102, "epoch": 1.4450296247531271, "grad_norm": 0.777664840221405, "learning_rate": 3.927510009970548e-06, "loss": 0.0231, "mean_token_accuracy": 0.9917639851570129, "num_tokens": 9356267.0, "step": 4390 }, { "entropy": 1.7339849829673768, "epoch": 1.446675444371297, "grad_norm": 2.712010383605957, "learning_rate": 3.905910626615046e-06, "loss": 0.0407, "mean_token_accuracy": 0.9871343851089478, "num_tokens": 9367167.0, "step": 4395 }, { "entropy": 1.7719183206558227, "epoch": 1.4483212639894667, "grad_norm": 1.5582504272460938, "learning_rate": 3.884356376411089e-06, "loss": 0.0214, "mean_token_accuracy": 0.9926240026950837, "num_tokens": 9377539.0, "step": 4400 }, { "entropy": 1.7510367512702942, "epoch": 1.4499670836076366, "grad_norm": 1.32984459400177, "learning_rate": 3.862847418990592e-06, "loss": 0.0321, "mean_token_accuracy": 0.9916741907596588, "num_tokens": 9388175.0, "step": 4405 }, { "entropy": 1.7010186672210694, "epoch": 1.4516129032258065, "grad_norm": 2.487252950668335, "learning_rate": 3.841383913650052e-06, "loss": 0.0429, "mean_token_accuracy": 0.9880555152893067, "num_tokens": 9398803.0, "step": 4410 }, { "entropy": 1.7146742224693299, "epoch": 1.4532587228439762, "grad_norm": 1.8718352317810059, "learning_rate": 3.819966019349334e-06, "loss": 0.0141, "mean_token_accuracy": 0.9953644871711731, "num_tokens": 9409524.0, "step": 4415 }, { "entropy": 1.6977456569671632, "epoch": 1.454904542462146, "grad_norm": 0.736453115940094, "learning_rate": 3.7985938947105073e-06, "loss": 0.0157, "mean_token_accuracy": 0.9962522923946381, "num_tokens": 9420376.0, "step": 4420 }, { "entropy": 1.7264664053916932, "epoch": 1.456550362080316, "grad_norm": 1.4576939344406128, "learning_rate": 3.7772676980166655e-06, "loss": 0.0282, "mean_token_accuracy": 0.9920338273048401, "num_tokens": 9430868.0, "step": 4425 }, { "entropy": 1.6992250204086303, "epoch": 1.4581961816984859, "grad_norm": 1.5100425481796265, "learning_rate": 3.7559875872107677e-06, "loss": 0.0359, "mean_token_accuracy": 0.9908050179481507, "num_tokens": 9441532.0, "step": 4430 }, { "entropy": 1.7052590131759644, "epoch": 1.4598420013166558, "grad_norm": 2.6018645763397217, "learning_rate": 3.7347537198944483e-06, "loss": 0.0213, "mean_token_accuracy": 0.992498379945755, "num_tokens": 9452300.0, "step": 4435 }, { "entropy": 1.6945829153060914, "epoch": 1.4614878209348254, "grad_norm": 1.3602592945098877, "learning_rate": 3.71356625332686e-06, "loss": 0.0188, "mean_token_accuracy": 0.9949553668498993, "num_tokens": 9462853.0, "step": 4440 }, { "entropy": 1.6705161213874817, "epoch": 1.4631336405529953, "grad_norm": 0.6543161273002625, "learning_rate": 3.6924253444235224e-06, "loss": 0.0178, "mean_token_accuracy": 0.9919328927993775, "num_tokens": 9473560.0, "step": 4445 }, { "entropy": 1.6843745946884154, "epoch": 1.4647794601711652, "grad_norm": 1.519394874572754, "learning_rate": 3.671331149755123e-06, "loss": 0.0143, "mean_token_accuracy": 0.9956124365329743, "num_tokens": 9484022.0, "step": 4450 }, { "entropy": 1.6935065150260926, "epoch": 1.4664252797893351, "grad_norm": 5.51362943649292, "learning_rate": 3.6502838255464045e-06, "loss": 0.0319, "mean_token_accuracy": 0.9897434711456299, "num_tokens": 9494354.0, "step": 4455 }, { "entropy": 1.632759189605713, "epoch": 1.468071099407505, "grad_norm": 1.1584962606430054, "learning_rate": 3.6292835276749715e-06, "loss": 0.0238, "mean_token_accuracy": 0.992765587568283, "num_tokens": 9505092.0, "step": 4460 }, { "entropy": 1.6529253125190735, "epoch": 1.4697169190256747, "grad_norm": 0.6863316893577576, "learning_rate": 3.6083304116701535e-06, "loss": 0.018, "mean_token_accuracy": 0.9955762267112732, "num_tokens": 9515748.0, "step": 4465 }, { "entropy": 1.6669641017913819, "epoch": 1.4713627386438446, "grad_norm": 0.19445084035396576, "learning_rate": 3.587424632711841e-06, "loss": 0.0305, "mean_token_accuracy": 0.9905413866043091, "num_tokens": 9526278.0, "step": 4470 }, { "entropy": 1.6730605006217956, "epoch": 1.4730085582620145, "grad_norm": 2.6678473949432373, "learning_rate": 3.56656634562936e-06, "loss": 0.024, "mean_token_accuracy": 0.9918078184127808, "num_tokens": 9537087.0, "step": 4475 }, { "entropy": 1.6874117732048035, "epoch": 1.4746543778801844, "grad_norm": 0.8383910655975342, "learning_rate": 3.5457557049002934e-06, "loss": 0.0173, "mean_token_accuracy": 0.9937337219715119, "num_tokens": 9547735.0, "step": 4480 }, { "entropy": 1.68024080991745, "epoch": 1.4763001974983543, "grad_norm": 2.533155918121338, "learning_rate": 3.52499286464936e-06, "loss": 0.0287, "mean_token_accuracy": 0.9925665855407715, "num_tokens": 9558405.0, "step": 4485 }, { "entropy": 1.6580557942390441, "epoch": 1.477946017116524, "grad_norm": 4.7806830406188965, "learning_rate": 3.5042779786472602e-06, "loss": 0.0245, "mean_token_accuracy": 0.9935833632946014, "num_tokens": 9569152.0, "step": 4490 }, { "entropy": 1.6794435620307921, "epoch": 1.4795918367346939, "grad_norm": 2.641050100326538, "learning_rate": 3.4836112003095524e-06, "loss": 0.0237, "mean_token_accuracy": 0.9917592287063599, "num_tokens": 9579673.0, "step": 4495 }, { "entropy": 1.6873149633407594, "epoch": 1.4812376563528638, "grad_norm": 1.5926048755645752, "learning_rate": 3.4629926826954997e-06, "loss": 0.0347, "mean_token_accuracy": 0.9919157564640045, "num_tokens": 9590146.0, "step": 4500 }, { "entropy": 1.6695775389671326, "epoch": 1.4828834759710336, "grad_norm": 3.867030382156372, "learning_rate": 3.4424225785069444e-06, "loss": 0.0248, "mean_token_accuracy": 0.993567830324173, "num_tokens": 9600795.0, "step": 4505 }, { "entropy": 1.6816540002822875, "epoch": 1.4845292955892035, "grad_norm": 2.597640037536621, "learning_rate": 3.421901040087177e-06, "loss": 0.0253, "mean_token_accuracy": 0.9921609818935394, "num_tokens": 9611539.0, "step": 4510 }, { "entropy": 1.6816054821014403, "epoch": 1.4861751152073732, "grad_norm": 0.9247928857803345, "learning_rate": 3.4014282194198045e-06, "loss": 0.0328, "mean_token_accuracy": 0.9923509240150452, "num_tokens": 9621948.0, "step": 4515 }, { "entropy": 1.7215697646141053, "epoch": 1.4878209348255431, "grad_norm": 1.9585981369018555, "learning_rate": 3.381004268127638e-06, "loss": 0.0158, "mean_token_accuracy": 0.995547354221344, "num_tokens": 9632351.0, "step": 4520 }, { "entropy": 1.7168682098388672, "epoch": 1.489466754443713, "grad_norm": 1.5342450141906738, "learning_rate": 3.360629337471548e-06, "loss": 0.0276, "mean_token_accuracy": 0.9930002510547637, "num_tokens": 9643093.0, "step": 4525 }, { "entropy": 1.6930954933166504, "epoch": 1.4911125740618827, "grad_norm": 1.625235915184021, "learning_rate": 3.340303578349361e-06, "loss": 0.0175, "mean_token_accuracy": 0.9939290165901185, "num_tokens": 9653833.0, "step": 4530 }, { "entropy": 1.7007075071334838, "epoch": 1.4927583936800526, "grad_norm": 0.7999654412269592, "learning_rate": 3.3200271412947294e-06, "loss": 0.0154, "mean_token_accuracy": 0.9946927070617676, "num_tokens": 9664334.0, "step": 4535 }, { "entropy": 1.670901095867157, "epoch": 1.4944042132982225, "grad_norm": 0.8723247647285461, "learning_rate": 3.2998001764760414e-06, "loss": 0.0173, "mean_token_accuracy": 0.9957589745521546, "num_tokens": 9675067.0, "step": 4540 }, { "entropy": 1.7091269135475158, "epoch": 1.4960500329163924, "grad_norm": 3.8337857723236084, "learning_rate": 3.2796228336952663e-06, "loss": 0.0306, "mean_token_accuracy": 0.9932047963142395, "num_tokens": 9685487.0, "step": 4545 }, { "entropy": 1.7051352858543396, "epoch": 1.4976958525345623, "grad_norm": 2.0214622020721436, "learning_rate": 3.259495262386888e-06, "loss": 0.0257, "mean_token_accuracy": 0.9929771959781647, "num_tokens": 9696067.0, "step": 4550 }, { "entropy": 1.716159415245056, "epoch": 1.499341672152732, "grad_norm": 1.7375526428222656, "learning_rate": 3.2394176116167818e-06, "loss": 0.0188, "mean_token_accuracy": 0.9942406177520752, "num_tokens": 9706729.0, "step": 4555 }, { "entropy": 1.712713897228241, "epoch": 1.500987491770902, "grad_norm": 3.6430742740631104, "learning_rate": 3.2193900300810908e-06, "loss": 0.0252, "mean_token_accuracy": 0.9919379532337189, "num_tokens": 9717306.0, "step": 4560 }, { "entropy": 1.6966816425323485, "epoch": 1.5026333113890717, "grad_norm": 0.11951875686645508, "learning_rate": 3.1994126661051628e-06, "loss": 0.0111, "mean_token_accuracy": 0.9968602657318115, "num_tokens": 9728013.0, "step": 4565 }, { "entropy": 1.7037412881851197, "epoch": 1.5042791310072416, "grad_norm": 1.588755488395691, "learning_rate": 3.179485667642419e-06, "loss": 0.0284, "mean_token_accuracy": 0.989536440372467, "num_tokens": 9738874.0, "step": 4570 }, { "entropy": 1.6956771850585937, "epoch": 1.5059249506254115, "grad_norm": 0.9138298034667969, "learning_rate": 3.1596091822732755e-06, "loss": 0.013, "mean_token_accuracy": 0.994870787858963, "num_tokens": 9749398.0, "step": 4575 }, { "entropy": 1.7180339097976685, "epoch": 1.5075707702435812, "grad_norm": 3.6119987964630127, "learning_rate": 3.1397833572040414e-06, "loss": 0.0471, "mean_token_accuracy": 0.9881613373756408, "num_tokens": 9760121.0, "step": 4580 }, { "entropy": 1.7082261800765992, "epoch": 1.5092165898617511, "grad_norm": 3.047520875930786, "learning_rate": 3.1200083392658464e-06, "loss": 0.0208, "mean_token_accuracy": 0.9932926952838897, "num_tokens": 9770729.0, "step": 4585 }, { "entropy": 1.696124267578125, "epoch": 1.510862409479921, "grad_norm": 4.276211738586426, "learning_rate": 3.1002842749135175e-06, "loss": 0.025, "mean_token_accuracy": 0.992525440454483, "num_tokens": 9781733.0, "step": 4590 }, { "entropy": 1.689008605480194, "epoch": 1.5125082290980907, "grad_norm": 0.7173812389373779, "learning_rate": 3.0806113102245395e-06, "loss": 0.027, "mean_token_accuracy": 0.9932247400283813, "num_tokens": 9792414.0, "step": 4595 }, { "entropy": 1.6812505960464477, "epoch": 1.5141540487162608, "grad_norm": 1.7421464920043945, "learning_rate": 3.0609895908979347e-06, "loss": 0.0297, "mean_token_accuracy": 0.9890223205089569, "num_tokens": 9802799.0, "step": 4600 }, { "entropy": 1.7080901622772218, "epoch": 1.5157998683344305, "grad_norm": 2.5039777755737305, "learning_rate": 3.041419262253208e-06, "loss": 0.0217, "mean_token_accuracy": 0.993123060464859, "num_tokens": 9813536.0, "step": 4605 }, { "entropy": 1.703446888923645, "epoch": 1.5174456879526004, "grad_norm": 1.0789647102355957, "learning_rate": 3.021900469229253e-06, "loss": 0.0183, "mean_token_accuracy": 0.9934683322906495, "num_tokens": 9824215.0, "step": 4610 }, { "entropy": 1.7133625149726868, "epoch": 1.5190915075707703, "grad_norm": 1.688781499862671, "learning_rate": 3.0024333563833007e-06, "loss": 0.0276, "mean_token_accuracy": 0.9893021941184997, "num_tokens": 9834752.0, "step": 4615 }, { "entropy": 1.7214069485664367, "epoch": 1.52073732718894, "grad_norm": 3.265244722366333, "learning_rate": 2.983018067889828e-06, "loss": 0.0352, "mean_token_accuracy": 0.9897504031658173, "num_tokens": 9845611.0, "step": 4620 }, { "entropy": 1.711961841583252, "epoch": 1.52238314680711, "grad_norm": 6.8311333656311035, "learning_rate": 2.963654747539494e-06, "loss": 0.0225, "mean_token_accuracy": 0.9920111656188965, "num_tokens": 9856980.0, "step": 4625 }, { "entropy": 1.7189836978912354, "epoch": 1.5240289664252797, "grad_norm": 1.7447693347930908, "learning_rate": 2.9443435387380936e-06, "loss": 0.012, "mean_token_accuracy": 0.9950068473815918, "num_tokens": 9867771.0, "step": 4630 }, { "entropy": 1.7515931606292725, "epoch": 1.5256747860434496, "grad_norm": 1.2482762336730957, "learning_rate": 2.9250845845054586e-06, "loss": 0.0279, "mean_token_accuracy": 0.9916983366012573, "num_tokens": 9878291.0, "step": 4635 }, { "entropy": 1.746778416633606, "epoch": 1.5273206056616195, "grad_norm": 0.6369831562042236, "learning_rate": 2.9058780274744426e-06, "loss": 0.0213, "mean_token_accuracy": 0.9942344844341278, "num_tokens": 9888741.0, "step": 4640 }, { "entropy": 1.741689419746399, "epoch": 1.5289664252797892, "grad_norm": 1.0881776809692383, "learning_rate": 2.8867240098898297e-06, "loss": 0.0212, "mean_token_accuracy": 0.9928946614265441, "num_tokens": 9899481.0, "step": 4645 }, { "entropy": 1.715234887599945, "epoch": 1.5306122448979593, "grad_norm": 2.7303154468536377, "learning_rate": 2.8676226736072975e-06, "loss": 0.0286, "mean_token_accuracy": 0.9908973276615143, "num_tokens": 9910093.0, "step": 4650 }, { "entropy": 1.7300845026969909, "epoch": 1.532258064516129, "grad_norm": 0.8988632559776306, "learning_rate": 2.848574160092362e-06, "loss": 0.0265, "mean_token_accuracy": 0.9913157403469086, "num_tokens": 9920988.0, "step": 4655 }, { "entropy": 1.7755305886268615, "epoch": 1.533903884134299, "grad_norm": 1.2511268854141235, "learning_rate": 2.829578610419337e-06, "loss": 0.0183, "mean_token_accuracy": 0.9922440350055695, "num_tokens": 9931448.0, "step": 4660 }, { "entropy": 1.7372457265853882, "epoch": 1.5355497037524688, "grad_norm": 1.8746650218963623, "learning_rate": 2.810636165270274e-06, "loss": 0.0448, "mean_token_accuracy": 0.9905406355857849, "num_tokens": 9942334.0, "step": 4665 }, { "entropy": 1.7496211767196654, "epoch": 1.5371955233706385, "grad_norm": 3.1501920223236084, "learning_rate": 2.7917469649339356e-06, "loss": 0.0177, "mean_token_accuracy": 0.9929421722888947, "num_tokens": 9952925.0, "step": 4670 }, { "entropy": 1.769037902355194, "epoch": 1.5388413429888086, "grad_norm": 1.5442581176757812, "learning_rate": 2.7729111493047458e-06, "loss": 0.0212, "mean_token_accuracy": 0.9939168155193329, "num_tokens": 9963276.0, "step": 4675 }, { "entropy": 1.7426852703094482, "epoch": 1.5404871626069783, "grad_norm": 1.5778064727783203, "learning_rate": 2.754128857881768e-06, "loss": 0.022, "mean_token_accuracy": 0.9926289975643158, "num_tokens": 9974069.0, "step": 4680 }, { "entropy": 1.7343324899673462, "epoch": 1.5421329822251482, "grad_norm": 0.8580919504165649, "learning_rate": 2.735400229767652e-06, "loss": 0.0208, "mean_token_accuracy": 0.9934548079967499, "num_tokens": 9984603.0, "step": 4685 }, { "entropy": 1.7297178983688355, "epoch": 1.543778801843318, "grad_norm": 1.2886171340942383, "learning_rate": 2.7167254036676183e-06, "loss": 0.0261, "mean_token_accuracy": 0.9902363300323487, "num_tokens": 9995352.0, "step": 4690 }, { "entropy": 1.6793412566184998, "epoch": 1.5454246214614877, "grad_norm": 0.6308950185775757, "learning_rate": 2.698104517888427e-06, "loss": 0.0088, "mean_token_accuracy": 0.9957121074199676, "num_tokens": 10006198.0, "step": 4695 }, { "entropy": 1.712625253200531, "epoch": 1.5470704410796576, "grad_norm": 4.383965969085693, "learning_rate": 2.679537710337352e-06, "loss": 0.0301, "mean_token_accuracy": 0.990826541185379, "num_tokens": 10017039.0, "step": 4700 }, { "entropy": 1.7353214979171754, "epoch": 1.5487162606978275, "grad_norm": 0.49334779381752014, "learning_rate": 2.6610251185211657e-06, "loss": 0.029, "mean_token_accuracy": 0.9952426433563233, "num_tokens": 10027402.0, "step": 4705 }, { "entropy": 1.6911566019058228, "epoch": 1.5503620803159972, "grad_norm": 2.034087896347046, "learning_rate": 2.6425668795451107e-06, "loss": 0.0281, "mean_token_accuracy": 0.9905034959316253, "num_tokens": 10038046.0, "step": 4710 }, { "entropy": 1.69894437789917, "epoch": 1.5520078999341673, "grad_norm": 2.4216017723083496, "learning_rate": 2.624163130111891e-06, "loss": 0.0165, "mean_token_accuracy": 0.9943257391452789, "num_tokens": 10048564.0, "step": 4715 }, { "entropy": 1.7041826963424682, "epoch": 1.553653719552337, "grad_norm": 1.8711634874343872, "learning_rate": 2.605814006520655e-06, "loss": 0.0154, "mean_token_accuracy": 0.9945900738239288, "num_tokens": 10059017.0, "step": 4720 }, { "entropy": 1.6660914540290832, "epoch": 1.555299539170507, "grad_norm": 0.6987809538841248, "learning_rate": 2.587519644666001e-06, "loss": 0.03, "mean_token_accuracy": 0.991539990901947, "num_tokens": 10069792.0, "step": 4725 }, { "entropy": 1.6404382228851317, "epoch": 1.5569453587886768, "grad_norm": 0.9349547028541565, "learning_rate": 2.5692801800369406e-06, "loss": 0.0161, "mean_token_accuracy": 0.9950057923793793, "num_tokens": 10080522.0, "step": 4730 }, { "entropy": 1.6546894431114196, "epoch": 1.5585911784068465, "grad_norm": 2.1536412239074707, "learning_rate": 2.5510957477159257e-06, "loss": 0.0295, "mean_token_accuracy": 0.9922216713428498, "num_tokens": 10091760.0, "step": 4735 }, { "entropy": 1.6477965712547302, "epoch": 1.5602369980250166, "grad_norm": 0.56485915184021, "learning_rate": 2.5329664823778444e-06, "loss": 0.0114, "mean_token_accuracy": 0.995609724521637, "num_tokens": 10102446.0, "step": 4740 }, { "entropy": 1.699891173839569, "epoch": 1.5618828176431863, "grad_norm": 2.4956679344177246, "learning_rate": 2.514892518288988e-06, "loss": 0.0398, "mean_token_accuracy": 0.9890912711620331, "num_tokens": 10113234.0, "step": 4745 }, { "entropy": 1.6791865229606628, "epoch": 1.5635286372613562, "grad_norm": 2.926832914352417, "learning_rate": 2.4968739893061132e-06, "loss": 0.0185, "mean_token_accuracy": 0.9936101496219635, "num_tokens": 10124034.0, "step": 4750 }, { "entropy": 1.6816865921020507, "epoch": 1.565174456879526, "grad_norm": 1.028403401374817, "learning_rate": 2.4789110288754038e-06, "loss": 0.0326, "mean_token_accuracy": 0.9936225533485412, "num_tokens": 10134671.0, "step": 4755 }, { "entropy": 1.7139351129531861, "epoch": 1.5668202764976957, "grad_norm": 4.163893222808838, "learning_rate": 2.461003770031504e-06, "loss": 0.0191, "mean_token_accuracy": 0.9935286998748779, "num_tokens": 10145418.0, "step": 4760 }, { "entropy": 1.7017668962478638, "epoch": 1.5684660961158658, "grad_norm": 2.05292010307312, "learning_rate": 2.4431523453965266e-06, "loss": 0.0295, "mean_token_accuracy": 0.9896788716316223, "num_tokens": 10156080.0, "step": 4765 }, { "entropy": 1.7077699899673462, "epoch": 1.5701119157340355, "grad_norm": 1.2193189859390259, "learning_rate": 2.4253568871790857e-06, "loss": 0.0426, "mean_token_accuracy": 0.9874286651611328, "num_tokens": 10166779.0, "step": 4770 }, { "entropy": 1.7373961925506591, "epoch": 1.5717577353522054, "grad_norm": 0.09195420891046524, "learning_rate": 2.407617527173285e-06, "loss": 0.0234, "mean_token_accuracy": 0.9929518520832061, "num_tokens": 10177151.0, "step": 4775 }, { "entropy": 1.753466045856476, "epoch": 1.5734035549703753, "grad_norm": 1.3904253244400024, "learning_rate": 2.3899343967577803e-06, "loss": 0.0269, "mean_token_accuracy": 0.990022599697113, "num_tokens": 10187902.0, "step": 4780 }, { "entropy": 1.7257111310958861, "epoch": 1.575049374588545, "grad_norm": 1.9471051692962646, "learning_rate": 2.3723076268947777e-06, "loss": 0.0231, "mean_token_accuracy": 0.9913094937801361, "num_tokens": 10198342.0, "step": 4785 }, { "entropy": 1.6994914054870605, "epoch": 1.576695194206715, "grad_norm": 1.1079803705215454, "learning_rate": 2.354737348129077e-06, "loss": 0.0127, "mean_token_accuracy": 0.994962739944458, "num_tokens": 10209017.0, "step": 4790 }, { "entropy": 1.7078461527824402, "epoch": 1.5783410138248848, "grad_norm": 0.8264980316162109, "learning_rate": 2.337223690587098e-06, "loss": 0.019, "mean_token_accuracy": 0.9939144611358642, "num_tokens": 10219775.0, "step": 4795 }, { "entropy": 1.690078580379486, "epoch": 1.5799868334430547, "grad_norm": 1.1476809978485107, "learning_rate": 2.3197667839759307e-06, "loss": 0.0244, "mean_token_accuracy": 0.9934214890003205, "num_tokens": 10230654.0, "step": 4800 }, { "entropy": 1.7367305874824523, "epoch": 1.5816326530612246, "grad_norm": 0.9815186262130737, "learning_rate": 2.302366757582355e-06, "loss": 0.0142, "mean_token_accuracy": 0.9940285742282867, "num_tokens": 10241163.0, "step": 4805 }, { "entropy": 1.7170762538909912, "epoch": 1.5832784726793943, "grad_norm": 2.2614142894744873, "learning_rate": 2.285023740271893e-06, "loss": 0.0193, "mean_token_accuracy": 0.994717925786972, "num_tokens": 10252132.0, "step": 4810 }, { "entropy": 1.6848518610000611, "epoch": 1.5849242922975642, "grad_norm": 0.6454903483390808, "learning_rate": 2.267737860487865e-06, "loss": 0.0182, "mean_token_accuracy": 0.9939591407775878, "num_tokens": 10263052.0, "step": 4815 }, { "entropy": 1.7132264494895935, "epoch": 1.586570111915734, "grad_norm": 1.0455392599105835, "learning_rate": 2.2505092462504153e-06, "loss": 0.0134, "mean_token_accuracy": 0.9961919903755188, "num_tokens": 10273616.0, "step": 4820 }, { "entropy": 1.7440693497657775, "epoch": 1.5882159315339037, "grad_norm": 1.3470888137817383, "learning_rate": 2.2333380251555826e-06, "loss": 0.0279, "mean_token_accuracy": 0.9919908702373504, "num_tokens": 10284093.0, "step": 4825 }, { "entropy": 1.7089533686637879, "epoch": 1.5898617511520738, "grad_norm": 1.1258236169815063, "learning_rate": 2.2162243243743485e-06, "loss": 0.0233, "mean_token_accuracy": 0.9929437875747681, "num_tokens": 10294993.0, "step": 4830 }, { "entropy": 1.677238130569458, "epoch": 1.5915075707702435, "grad_norm": 3.9911301136016846, "learning_rate": 2.1991682706516935e-06, "loss": 0.0294, "mean_token_accuracy": 0.9925793290138245, "num_tokens": 10305763.0, "step": 4835 }, { "entropy": 1.679075539112091, "epoch": 1.5931533903884134, "grad_norm": 2.0645158290863037, "learning_rate": 2.1821699903056627e-06, "loss": 0.028, "mean_token_accuracy": 0.9916234374046325, "num_tokens": 10316288.0, "step": 4840 }, { "entropy": 1.6844815373420716, "epoch": 1.5947992100065833, "grad_norm": 0.7942723631858826, "learning_rate": 2.1652296092264324e-06, "loss": 0.0288, "mean_token_accuracy": 0.9914738476276398, "num_tokens": 10327136.0, "step": 4845 }, { "entropy": 1.681308126449585, "epoch": 1.596445029624753, "grad_norm": 2.3584067821502686, "learning_rate": 2.148347252875368e-06, "loss": 0.0106, "mean_token_accuracy": 0.9962167620658875, "num_tokens": 10337795.0, "step": 4850 }, { "entropy": 1.6970802187919616, "epoch": 1.598090849242923, "grad_norm": 0.7867595553398132, "learning_rate": 2.1315230462840985e-06, "loss": 0.0253, "mean_token_accuracy": 0.9912108719348908, "num_tokens": 10348719.0, "step": 4855 }, { "entropy": 1.706315529346466, "epoch": 1.5997366688610928, "grad_norm": 1.8649603128433228, "learning_rate": 2.114757114053605e-06, "loss": 0.0211, "mean_token_accuracy": 0.992872542142868, "num_tokens": 10359442.0, "step": 4860 }, { "epoch": 1.6010533245556287, "eval_entropy": 1.6943830250834868, "eval_loss": 0.048175811767578125, "eval_mean_token_accuracy": 0.9870549200333505, "eval_num_tokens": 10368037.0, "eval_runtime": 196.4348, "eval_samples_per_second": 42.391, "eval_steps_per_second": 7.066, "step": 4864 }, { "entropy": 1.7008237361907959, "epoch": 1.6013824884792627, "grad_norm": 0.8595287799835205, "learning_rate": 2.098049580353273e-06, "loss": 0.0191, "mean_token_accuracy": 0.9942748367786407, "num_tokens": 10370122.0, "step": 4865 }, { "entropy": 1.697910237312317, "epoch": 1.6030283080974326, "grad_norm": 1.504879355430603, "learning_rate": 2.08140056891999e-06, "loss": 0.0133, "mean_token_accuracy": 0.9958468794822692, "num_tokens": 10380595.0, "step": 4870 }, { "entropy": 1.6834197640419006, "epoch": 1.6046741277156022, "grad_norm": 6.986445903778076, "learning_rate": 2.0648102030572225e-06, "loss": 0.03, "mean_token_accuracy": 0.9888522446155548, "num_tokens": 10391431.0, "step": 4875 }, { "entropy": 1.6846235394477844, "epoch": 1.6063199473337724, "grad_norm": 4.83704137802124, "learning_rate": 2.048278605634113e-06, "loss": 0.0198, "mean_token_accuracy": 0.9933947384357452, "num_tokens": 10402189.0, "step": 4880 }, { "entropy": 1.6738305926322936, "epoch": 1.607965766951942, "grad_norm": 1.9007184505462646, "learning_rate": 2.0318058990845467e-06, "loss": 0.0189, "mean_token_accuracy": 0.9959712147712707, "num_tokens": 10412659.0, "step": 4885 }, { "entropy": 1.668698501586914, "epoch": 1.609611586570112, "grad_norm": 5.049774169921875, "learning_rate": 2.0153922054062758e-06, "loss": 0.0212, "mean_token_accuracy": 0.9936519682407379, "num_tokens": 10423408.0, "step": 4890 }, { "entropy": 1.6653715133666993, "epoch": 1.6112574061882818, "grad_norm": 2.830352306365967, "learning_rate": 1.999037646159989e-06, "loss": 0.0142, "mean_token_accuracy": 0.9954032123088836, "num_tokens": 10434264.0, "step": 4895 }, { "entropy": 1.6867196083068847, "epoch": 1.6129032258064515, "grad_norm": 2.5599348545074463, "learning_rate": 1.9827423424684267e-06, "loss": 0.0332, "mean_token_accuracy": 0.9906599521636963, "num_tokens": 10445083.0, "step": 4900 }, { "entropy": 1.6548929691314698, "epoch": 1.6145490454246214, "grad_norm": 1.4400912523269653, "learning_rate": 1.966506415015477e-06, "loss": 0.0193, "mean_token_accuracy": 0.994045513868332, "num_tokens": 10455748.0, "step": 4905 }, { "entropy": 1.704959547519684, "epoch": 1.6161948650427913, "grad_norm": 1.0404367446899414, "learning_rate": 1.9503299840452927e-06, "loss": 0.0202, "mean_token_accuracy": 0.9917968034744262, "num_tokens": 10466258.0, "step": 4910 }, { "entropy": 1.7084056258201599, "epoch": 1.6178406846609612, "grad_norm": 1.2847148180007935, "learning_rate": 1.9342131693613763e-06, "loss": 0.0245, "mean_token_accuracy": 0.9919559478759765, "num_tokens": 10476806.0, "step": 4915 }, { "entropy": 1.6556385517120362, "epoch": 1.619486504279131, "grad_norm": 2.1901090145111084, "learning_rate": 1.9181560903257234e-06, "loss": 0.0242, "mean_token_accuracy": 0.992319130897522, "num_tokens": 10487881.0, "step": 4920 }, { "entropy": 1.6394746541976928, "epoch": 1.6211323238973008, "grad_norm": 1.6991859674453735, "learning_rate": 1.9021588658579249e-06, "loss": 0.0259, "mean_token_accuracy": 0.9934229254722595, "num_tokens": 10498864.0, "step": 4925 }, { "entropy": 1.7088184237480164, "epoch": 1.6227781435154707, "grad_norm": 1.4522404670715332, "learning_rate": 1.8862216144342692e-06, "loss": 0.0176, "mean_token_accuracy": 0.9922803401947021, "num_tokens": 10509379.0, "step": 4930 }, { "entropy": 1.6745522737503051, "epoch": 1.6244239631336406, "grad_norm": 0.5447269678115845, "learning_rate": 1.870344454086901e-06, "loss": 0.0315, "mean_token_accuracy": 0.9911853075027466, "num_tokens": 10519980.0, "step": 4935 }, { "entropy": 1.6869741559028626, "epoch": 1.6260697827518102, "grad_norm": 1.314532995223999, "learning_rate": 1.8545275024029141e-06, "loss": 0.0194, "mean_token_accuracy": 0.9942934930324554, "num_tokens": 10530726.0, "step": 4940 }, { "entropy": 1.6622373104095458, "epoch": 1.6277156023699804, "grad_norm": 1.3033603429794312, "learning_rate": 1.838770876523498e-06, "loss": 0.0211, "mean_token_accuracy": 0.9951237618923188, "num_tokens": 10541525.0, "step": 4945 }, { "entropy": 1.6855845212936402, "epoch": 1.62936142198815, "grad_norm": 2.0838241577148438, "learning_rate": 1.8230746931430642e-06, "loss": 0.0201, "mean_token_accuracy": 0.9907946407794952, "num_tokens": 10552191.0, "step": 4950 }, { "entropy": 1.6786072969436645, "epoch": 1.63100724160632, "grad_norm": 0.6673282384872437, "learning_rate": 1.807439068508392e-06, "loss": 0.0107, "mean_token_accuracy": 0.9967107653617859, "num_tokens": 10562922.0, "step": 4955 }, { "entropy": 1.6981304287910461, "epoch": 1.6326530612244898, "grad_norm": 0.6075028777122498, "learning_rate": 1.7918641184177444e-06, "loss": 0.0183, "mean_token_accuracy": 0.992760181427002, "num_tokens": 10573352.0, "step": 4960 }, { "entropy": 1.6855650663375854, "epoch": 1.6342988808426595, "grad_norm": 1.076545000076294, "learning_rate": 1.7763499582200405e-06, "loss": 0.023, "mean_token_accuracy": 0.9909692645072937, "num_tokens": 10584337.0, "step": 4965 }, { "entropy": 1.6684273719787597, "epoch": 1.6359447004608296, "grad_norm": 0.9483628869056702, "learning_rate": 1.7608967028139767e-06, "loss": 0.025, "mean_token_accuracy": 0.9940790653228759, "num_tokens": 10595236.0, "step": 4970 }, { "entropy": 1.6795395970344544, "epoch": 1.6375905200789993, "grad_norm": 2.4686009883880615, "learning_rate": 1.7455044666471875e-06, "loss": 0.0407, "mean_token_accuracy": 0.9889774978160858, "num_tokens": 10605904.0, "step": 4975 }, { "entropy": 1.7077094316482544, "epoch": 1.6392363396971692, "grad_norm": 1.020444393157959, "learning_rate": 1.7301733637153994e-06, "loss": 0.0318, "mean_token_accuracy": 0.9912243783473969, "num_tokens": 10616311.0, "step": 4980 }, { "entropy": 1.6756536841392518, "epoch": 1.640882159315339, "grad_norm": 4.183596134185791, "learning_rate": 1.7149035075615795e-06, "loss": 0.0325, "mean_token_accuracy": 0.9907270252704621, "num_tokens": 10627079.0, "step": 4985 }, { "entropy": 1.6901473999023438, "epoch": 1.6425279789335088, "grad_norm": 1.742541790008545, "learning_rate": 1.6996950112750964e-06, "loss": 0.0144, "mean_token_accuracy": 0.9967920780181885, "num_tokens": 10637698.0, "step": 4990 }, { "entropy": 1.665801465511322, "epoch": 1.6441737985516789, "grad_norm": 2.1923062801361084, "learning_rate": 1.6845479874908865e-06, "loss": 0.0236, "mean_token_accuracy": 0.9916431307792664, "num_tokens": 10648267.0, "step": 4995 }, { "entropy": 1.6393486261367798, "epoch": 1.6458196181698486, "grad_norm": 0.6840113401412964, "learning_rate": 1.6694625483886195e-06, "loss": 0.0164, "mean_token_accuracy": 0.9945251405239105, "num_tokens": 10659109.0, "step": 5000 }, { "entropy": 1.6628060221672059, "epoch": 1.6474654377880185, "grad_norm": 2.219745397567749, "learning_rate": 1.6544388056918614e-06, "loss": 0.0254, "mean_token_accuracy": 0.9924133539199829, "num_tokens": 10669828.0, "step": 5005 }, { "entropy": 1.6695847511291504, "epoch": 1.6491112574061884, "grad_norm": 1.5060310363769531, "learning_rate": 1.6394768706672547e-06, "loss": 0.0244, "mean_token_accuracy": 0.9916559994220734, "num_tokens": 10680094.0, "step": 5010 }, { "entropy": 1.659652292728424, "epoch": 1.650757077024358, "grad_norm": 3.3251304626464844, "learning_rate": 1.6245768541236894e-06, "loss": 0.0202, "mean_token_accuracy": 0.9898216426372528, "num_tokens": 10690770.0, "step": 5015 }, { "entropy": 1.6149708151817321, "epoch": 1.652402896642528, "grad_norm": 1.9098771810531616, "learning_rate": 1.6097388664114833e-06, "loss": 0.0147, "mean_token_accuracy": 0.9929858863353729, "num_tokens": 10702024.0, "step": 5020 }, { "entropy": 1.6477410674095154, "epoch": 1.6540487162606978, "grad_norm": 1.96040940284729, "learning_rate": 1.5949630174215647e-06, "loss": 0.0144, "mean_token_accuracy": 0.9963162183761597, "num_tokens": 10712833.0, "step": 5025 }, { "entropy": 1.6466076374053955, "epoch": 1.6556945358788677, "grad_norm": 0.9714366793632507, "learning_rate": 1.580249416584666e-06, "loss": 0.0365, "mean_token_accuracy": 0.9881292939186096, "num_tokens": 10723440.0, "step": 5030 }, { "entropy": 1.669557237625122, "epoch": 1.6573403554970376, "grad_norm": 1.7277604341506958, "learning_rate": 1.5655981728704973e-06, "loss": 0.0239, "mean_token_accuracy": 0.991591501235962, "num_tokens": 10733955.0, "step": 5035 }, { "entropy": 1.6399479150772094, "epoch": 1.6589861751152073, "grad_norm": 5.28700590133667, "learning_rate": 1.5510093947869508e-06, "loss": 0.0201, "mean_token_accuracy": 0.9937320530414582, "num_tokens": 10744942.0, "step": 5040 }, { "entropy": 1.688657033443451, "epoch": 1.6606319947333772, "grad_norm": 6.3327813148498535, "learning_rate": 1.536483190379302e-06, "loss": 0.0227, "mean_token_accuracy": 0.9922419965267182, "num_tokens": 10755594.0, "step": 5045 }, { "entropy": 1.6918665885925293, "epoch": 1.662277814351547, "grad_norm": 0.1568760871887207, "learning_rate": 1.522019667229393e-06, "loss": 0.009, "mean_token_accuracy": 0.9979844152927398, "num_tokens": 10766398.0, "step": 5050 }, { "entropy": 1.6314533591270446, "epoch": 1.6639236339697168, "grad_norm": 1.880570411682129, "learning_rate": 1.5076189324548506e-06, "loss": 0.0203, "mean_token_accuracy": 0.9929390490055084, "num_tokens": 10777255.0, "step": 5055 }, { "entropy": 1.6015793800354003, "epoch": 1.6655694535878869, "grad_norm": 1.6532872915267944, "learning_rate": 1.493281092708283e-06, "loss": 0.0153, "mean_token_accuracy": 0.9948068380355835, "num_tokens": 10788343.0, "step": 5060 }, { "entropy": 1.6586069226264955, "epoch": 1.6672152732060566, "grad_norm": 1.2856965065002441, "learning_rate": 1.479006254176505e-06, "loss": 0.0203, "mean_token_accuracy": 0.992956918478012, "num_tokens": 10799130.0, "step": 5065 }, { "entropy": 1.6501345872879027, "epoch": 1.6688610928242265, "grad_norm": 2.5434141159057617, "learning_rate": 1.4647945225797244e-06, "loss": 0.0261, "mean_token_accuracy": 0.9912129878997803, "num_tokens": 10809881.0, "step": 5070 }, { "entropy": 1.614508295059204, "epoch": 1.6705069124423964, "grad_norm": 0.4630468189716339, "learning_rate": 1.4506460031707903e-06, "loss": 0.0148, "mean_token_accuracy": 0.9954249680042266, "num_tokens": 10820794.0, "step": 5075 }, { "entropy": 1.645851767063141, "epoch": 1.672152732060566, "grad_norm": 2.122525930404663, "learning_rate": 1.4365608007343922e-06, "loss": 0.026, "mean_token_accuracy": 0.9927963316440582, "num_tokens": 10831234.0, "step": 5080 }, { "entropy": 1.6439926147460937, "epoch": 1.6737985516787361, "grad_norm": 0.7459424138069153, "learning_rate": 1.4225390195862932e-06, "loss": 0.0241, "mean_token_accuracy": 0.9936447679996491, "num_tokens": 10842096.0, "step": 5085 }, { "entropy": 1.6597126364707946, "epoch": 1.6754443712969058, "grad_norm": 1.7774500846862793, "learning_rate": 1.4085807635725491e-06, "loss": 0.0235, "mean_token_accuracy": 0.9914094746112824, "num_tokens": 10852586.0, "step": 5090 }, { "entropy": 1.7044395089149476, "epoch": 1.6770901909150757, "grad_norm": 1.1517224311828613, "learning_rate": 1.3946861360687548e-06, "loss": 0.0174, "mean_token_accuracy": 0.9941137075424195, "num_tokens": 10862934.0, "step": 5095 }, { "entropy": 1.6369341015815735, "epoch": 1.6787360105332456, "grad_norm": 0.2626267671585083, "learning_rate": 1.380855239979264e-06, "loss": 0.017, "mean_token_accuracy": 0.99591423869133, "num_tokens": 10873852.0, "step": 5100 }, { "entropy": 1.6440617442131042, "epoch": 1.6803818301514153, "grad_norm": 2.385711193084717, "learning_rate": 1.3670881777364276e-06, "loss": 0.0389, "mean_token_accuracy": 0.9895196318626404, "num_tokens": 10884490.0, "step": 5105 }, { "entropy": 1.6631293177604676, "epoch": 1.6820276497695854, "grad_norm": 2.767446994781494, "learning_rate": 1.3533850512998515e-06, "loss": 0.043, "mean_token_accuracy": 0.9870490610599518, "num_tokens": 10894958.0, "step": 5110 }, { "entropy": 1.671304202079773, "epoch": 1.683673469387755, "grad_norm": 1.706010103225708, "learning_rate": 1.339745962155613e-06, "loss": 0.0214, "mean_token_accuracy": 0.9934303402900696, "num_tokens": 10905373.0, "step": 5115 }, { "entropy": 1.652925717830658, "epoch": 1.685319289005925, "grad_norm": 1.8445219993591309, "learning_rate": 1.3261710113155436e-06, "loss": 0.0286, "mean_token_accuracy": 0.9917166590690613, "num_tokens": 10916144.0, "step": 5120 }, { "entropy": 1.6853961229324341, "epoch": 1.6869651086240949, "grad_norm": 0.4994262754917145, "learning_rate": 1.3126602993164505e-06, "loss": 0.0155, "mean_token_accuracy": 0.9949258744716645, "num_tokens": 10926960.0, "step": 5125 }, { "entropy": 1.6451457619667054, "epoch": 1.6886109282422646, "grad_norm": 0.9345157742500305, "learning_rate": 1.2992139262193893e-06, "loss": 0.0185, "mean_token_accuracy": 0.9947284281253814, "num_tokens": 10937405.0, "step": 5130 }, { "entropy": 1.624656081199646, "epoch": 1.6902567478604344, "grad_norm": 1.3270047903060913, "learning_rate": 1.2858319916089156e-06, "loss": 0.0168, "mean_token_accuracy": 0.99341681599617, "num_tokens": 10948172.0, "step": 5135 }, { "entropy": 1.659364914894104, "epoch": 1.6919025674786043, "grad_norm": 0.4350810647010803, "learning_rate": 1.2725145945923588e-06, "loss": 0.0183, "mean_token_accuracy": 0.99389089345932, "num_tokens": 10958763.0, "step": 5140 }, { "entropy": 1.6685426354408264, "epoch": 1.6935483870967742, "grad_norm": 1.704720377922058, "learning_rate": 1.2592618337990647e-06, "loss": 0.0285, "mean_token_accuracy": 0.9913337886333465, "num_tokens": 10969305.0, "step": 5145 }, { "entropy": 1.6044569730758667, "epoch": 1.6951942067149441, "grad_norm": 0.8705942034721375, "learning_rate": 1.2460738073796929e-06, "loss": 0.0238, "mean_token_accuracy": 0.9927083373069763, "num_tokens": 10980236.0, "step": 5150 }, { "entropy": 1.685863721370697, "epoch": 1.6968400263331138, "grad_norm": 0.5871350765228271, "learning_rate": 1.2329506130054703e-06, "loss": 0.0187, "mean_token_accuracy": 0.9933398306369782, "num_tokens": 10990582.0, "step": 5155 }, { "entropy": 1.6742502808570863, "epoch": 1.6984858459512837, "grad_norm": 2.4911105632781982, "learning_rate": 1.219892347867474e-06, "loss": 0.0298, "mean_token_accuracy": 0.9904028534889221, "num_tokens": 11000966.0, "step": 5160 }, { "entropy": 1.6669535398483277, "epoch": 1.7001316655694536, "grad_norm": 1.9511528015136719, "learning_rate": 1.2068991086759175e-06, "loss": 0.0377, "mean_token_accuracy": 0.9925346195697784, "num_tokens": 11011630.0, "step": 5165 }, { "entropy": 1.6653909087181091, "epoch": 1.7017774851876233, "grad_norm": 0.7113050222396851, "learning_rate": 1.1939709916594222e-06, "loss": 0.0297, "mean_token_accuracy": 0.9918692409992218, "num_tokens": 11022348.0, "step": 5170 }, { "entropy": 1.6599775552749634, "epoch": 1.7034233048057934, "grad_norm": 2.2692055702209473, "learning_rate": 1.1811080925643125e-06, "loss": 0.0344, "mean_token_accuracy": 0.9922800958156586, "num_tokens": 11032982.0, "step": 5175 }, { "entropy": 1.6723967909812927, "epoch": 1.705069124423963, "grad_norm": 1.5152289867401123, "learning_rate": 1.1683105066539068e-06, "loss": 0.0172, "mean_token_accuracy": 0.9953593909740448, "num_tokens": 11043325.0, "step": 5180 }, { "entropy": 1.6335657358169555, "epoch": 1.706714944042133, "grad_norm": 1.356180191040039, "learning_rate": 1.1555783287078116e-06, "loss": 0.0215, "mean_token_accuracy": 0.9901393771171569, "num_tokens": 11054285.0, "step": 5185 }, { "entropy": 1.6311080694198608, "epoch": 1.7083607636603029, "grad_norm": 1.751895785331726, "learning_rate": 1.142911653021217e-06, "loss": 0.0163, "mean_token_accuracy": 0.994689530134201, "num_tokens": 11064881.0, "step": 5190 }, { "entropy": 1.6046368837356568, "epoch": 1.7100065832784725, "grad_norm": 0.38154032826423645, "learning_rate": 1.1303105734041996e-06, "loss": 0.0181, "mean_token_accuracy": 0.9927674353122711, "num_tokens": 11076136.0, "step": 5195 }, { "entropy": 1.6612093567848205, "epoch": 1.7116524028966427, "grad_norm": 0.979333221912384, "learning_rate": 1.1177751831810279e-06, "loss": 0.0335, "mean_token_accuracy": 0.9923636972904205, "num_tokens": 11086778.0, "step": 5200 }, { "entropy": 1.6807520270347596, "epoch": 1.7132982225148123, "grad_norm": 1.7828707695007324, "learning_rate": 1.1053055751894726e-06, "loss": 0.0277, "mean_token_accuracy": 0.993100905418396, "num_tokens": 11097370.0, "step": 5205 }, { "entropy": 1.6403346300125121, "epoch": 1.7149440421329822, "grad_norm": 2.5002477169036865, "learning_rate": 1.0929018417801129e-06, "loss": 0.017, "mean_token_accuracy": 0.9920848309993744, "num_tokens": 11108166.0, "step": 5210 }, { "entropy": 1.6517832636833192, "epoch": 1.7165898617511521, "grad_norm": 1.3457746505737305, "learning_rate": 1.0805640748156675e-06, "loss": 0.0213, "mean_token_accuracy": 0.9931346833705902, "num_tokens": 11118902.0, "step": 5215 }, { "entropy": 1.6854795217514038, "epoch": 1.7182356813693218, "grad_norm": 0.07055312395095825, "learning_rate": 1.0682923656702948e-06, "loss": 0.0211, "mean_token_accuracy": 0.9933703184127808, "num_tokens": 11129409.0, "step": 5220 }, { "entropy": 1.7007460117340087, "epoch": 1.719881500987492, "grad_norm": 1.2991859912872314, "learning_rate": 1.0560868052289253e-06, "loss": 0.0244, "mean_token_accuracy": 0.9902635872364044, "num_tokens": 11140079.0, "step": 5225 }, { "entropy": 1.6376017451286315, "epoch": 1.7215273206056616, "grad_norm": 0.83013516664505, "learning_rate": 1.0439474838865981e-06, "loss": 0.013, "mean_token_accuracy": 0.9975570976734162, "num_tokens": 11150683.0, "step": 5230 }, { "entropy": 1.6345697283744811, "epoch": 1.7231731402238315, "grad_norm": 3.94348406791687, "learning_rate": 1.031874491547773e-06, "loss": 0.0189, "mean_token_accuracy": 0.9938337445259094, "num_tokens": 11161334.0, "step": 5235 }, { "entropy": 1.648432207107544, "epoch": 1.7248189598420014, "grad_norm": 1.0848654508590698, "learning_rate": 1.0198679176256742e-06, "loss": 0.0287, "mean_token_accuracy": 0.9918507993221283, "num_tokens": 11171950.0, "step": 5240 }, { "entropy": 1.660215425491333, "epoch": 1.726464779460171, "grad_norm": 1.0720781087875366, "learning_rate": 1.0079278510416313e-06, "loss": 0.0297, "mean_token_accuracy": 0.9905134677886963, "num_tokens": 11182536.0, "step": 5245 }, { "entropy": 1.6251054883003235, "epoch": 1.728110599078341, "grad_norm": 2.8627305030822754, "learning_rate": 9.960543802244195e-07, "loss": 0.0214, "mean_token_accuracy": 0.991985023021698, "num_tokens": 11193429.0, "step": 5250 }, { "entropy": 1.668004846572876, "epoch": 1.7297564186965109, "grad_norm": 0.5310274958610535, "learning_rate": 9.842475931095895e-07, "loss": 0.0398, "mean_token_accuracy": 0.9901407361030579, "num_tokens": 11204189.0, "step": 5255 }, { "entropy": 1.6230967044830322, "epoch": 1.7314022383146808, "grad_norm": 1.8132059574127197, "learning_rate": 9.725075771388449e-07, "loss": 0.0181, "mean_token_accuracy": 0.9925452589988708, "num_tokens": 11214880.0, "step": 5260 }, { "entropy": 1.629646909236908, "epoch": 1.7330480579328507, "grad_norm": 0.9202048182487488, "learning_rate": 9.60834419259369e-07, "loss": 0.0242, "mean_token_accuracy": 0.9919378876686096, "num_tokens": 11225689.0, "step": 5265 }, { "entropy": 1.6248352885246278, "epoch": 1.7346938775510203, "grad_norm": 0.7040166258811951, "learning_rate": 9.492282059231917e-07, "loss": 0.0173, "mean_token_accuracy": 0.9930191397666931, "num_tokens": 11236662.0, "step": 5270 }, { "entropy": 1.6555840373039246, "epoch": 1.7363396971691902, "grad_norm": 0.31907936930656433, "learning_rate": 9.376890230865487e-07, "loss": 0.0105, "mean_token_accuracy": 0.9968835175037384, "num_tokens": 11247227.0, "step": 5275 }, { "entropy": 1.6610607862472535, "epoch": 1.7379855167873601, "grad_norm": 1.0085439682006836, "learning_rate": 9.262169562092483e-07, "loss": 0.0167, "mean_token_accuracy": 0.9929597020149231, "num_tokens": 11258079.0, "step": 5280 }, { "entropy": 1.6653624534606934, "epoch": 1.7396313364055298, "grad_norm": 1.6425352096557617, "learning_rate": 9.148120902540281e-07, "loss": 0.0244, "mean_token_accuracy": 0.9898296535015106, "num_tokens": 11268656.0, "step": 5285 }, { "entropy": 1.6512943148612975, "epoch": 1.7412771560237, "grad_norm": 1.9848991632461548, "learning_rate": 9.034745096859332e-07, "loss": 0.0165, "mean_token_accuracy": 0.9939022302627564, "num_tokens": 11279189.0, "step": 5290 }, { "entropy": 1.633478820323944, "epoch": 1.7429229756418696, "grad_norm": 1.188881278038025, "learning_rate": 8.922042984716972e-07, "loss": 0.0211, "mean_token_accuracy": 0.9940697014331817, "num_tokens": 11289941.0, "step": 5295 }, { "entropy": 1.6428375720977784, "epoch": 1.7445687952600395, "grad_norm": 1.3060122728347778, "learning_rate": 8.810015400790994e-07, "loss": 0.0146, "mean_token_accuracy": 0.9937053561210633, "num_tokens": 11300919.0, "step": 5300 }, { "entropy": 1.644476592540741, "epoch": 1.7462146148782094, "grad_norm": 4.126039981842041, "learning_rate": 8.69866317476371e-07, "loss": 0.0436, "mean_token_accuracy": 0.9879814326763153, "num_tokens": 11311626.0, "step": 5305 }, { "entropy": 1.6524176239967345, "epoch": 1.747860434496379, "grad_norm": 0.9086925387382507, "learning_rate": 8.587987131315656e-07, "loss": 0.0384, "mean_token_accuracy": 0.9912387907505036, "num_tokens": 11322432.0, "step": 5310 }, { "entropy": 1.6595850586891174, "epoch": 1.7495062541145492, "grad_norm": 1.3107917308807373, "learning_rate": 8.477988090119515e-07, "loss": 0.0209, "mean_token_accuracy": 0.9909784734249115, "num_tokens": 11333086.0, "step": 5315 }, { "entropy": 1.6489954590797424, "epoch": 1.7511520737327189, "grad_norm": 0.6780989766120911, "learning_rate": 8.36866686583404e-07, "loss": 0.013, "mean_token_accuracy": 0.9960228025913238, "num_tokens": 11343698.0, "step": 5320 }, { "entropy": 1.6577156782150269, "epoch": 1.7527978933508888, "grad_norm": 2.5030269622802734, "learning_rate": 8.260024268098121e-07, "loss": 0.0407, "mean_token_accuracy": 0.9897329092025757, "num_tokens": 11354384.0, "step": 5325 }, { "entropy": 1.6797374844551087, "epoch": 1.7544437129690587, "grad_norm": 1.1785377264022827, "learning_rate": 8.152061101524578e-07, "loss": 0.0204, "mean_token_accuracy": 0.992442113161087, "num_tokens": 11364888.0, "step": 5330 }, { "entropy": 1.6517166137695312, "epoch": 1.7560895325872283, "grad_norm": 2.3063104152679443, "learning_rate": 8.044778165694434e-07, "loss": 0.0249, "mean_token_accuracy": 0.9921338021755218, "num_tokens": 11375641.0, "step": 5335 }, { "entropy": 1.644195032119751, "epoch": 1.7577353522053984, "grad_norm": 2.3999085426330566, "learning_rate": 7.93817625515082e-07, "loss": 0.0175, "mean_token_accuracy": 0.9942924916744232, "num_tokens": 11386312.0, "step": 5340 }, { "entropy": 1.6763506770133971, "epoch": 1.7593811718235681, "grad_norm": 2.3939199447631836, "learning_rate": 7.832256159393181e-07, "loss": 0.0242, "mean_token_accuracy": 0.990136843919754, "num_tokens": 11396648.0, "step": 5345 }, { "entropy": 1.7076772809028626, "epoch": 1.761026991441738, "grad_norm": 4.3696184158325195, "learning_rate": 7.727018662871432e-07, "loss": 0.0236, "mean_token_accuracy": 0.9929444313049316, "num_tokens": 11407191.0, "step": 5350 }, { "entropy": 1.6799395561218262, "epoch": 1.762672811059908, "grad_norm": 1.7837049961090088, "learning_rate": 7.62246454498009e-07, "loss": 0.0212, "mean_token_accuracy": 0.993122935295105, "num_tokens": 11417870.0, "step": 5355 }, { "entropy": 1.6892127633094787, "epoch": 1.7643186306780776, "grad_norm": 1.9029659032821655, "learning_rate": 7.518594580052519e-07, "loss": 0.0253, "mean_token_accuracy": 0.9921809673309326, "num_tokens": 11428181.0, "step": 5360 }, { "entropy": 1.6493794798851014, "epoch": 1.7659644502962475, "grad_norm": 0.7186495065689087, "learning_rate": 7.415409537355222e-07, "loss": 0.016, "mean_token_accuracy": 0.9955087423324585, "num_tokens": 11439027.0, "step": 5365 }, { "entropy": 1.668405246734619, "epoch": 1.7676102699144174, "grad_norm": 0.64583420753479, "learning_rate": 7.312910181082178e-07, "loss": 0.0149, "mean_token_accuracy": 0.9964608550071716, "num_tokens": 11449456.0, "step": 5370 }, { "entropy": 1.6470329999923705, "epoch": 1.7692560895325873, "grad_norm": 0.6577222347259521, "learning_rate": 7.211097270349065e-07, "loss": 0.0128, "mean_token_accuracy": 0.9961907029151916, "num_tokens": 11459922.0, "step": 5375 }, { "entropy": 1.6427839875221253, "epoch": 1.7709019091507572, "grad_norm": 1.2375580072402954, "learning_rate": 7.109971559187767e-07, "loss": 0.0251, "mean_token_accuracy": 0.9925231218338013, "num_tokens": 11470923.0, "step": 5380 }, { "entropy": 1.6566937565803528, "epoch": 1.7725477287689269, "grad_norm": 2.0263426303863525, "learning_rate": 7.00953379654068e-07, "loss": 0.012, "mean_token_accuracy": 0.9966720283031464, "num_tokens": 11481319.0, "step": 5385 }, { "entropy": 1.647239327430725, "epoch": 1.7741935483870968, "grad_norm": 0.439744770526886, "learning_rate": 6.909784726255242e-07, "loss": 0.0136, "mean_token_accuracy": 0.9953515648841857, "num_tokens": 11491892.0, "step": 5390 }, { "entropy": 1.653624951839447, "epoch": 1.7758393680052666, "grad_norm": 2.360499620437622, "learning_rate": 6.810725087078395e-07, "loss": 0.0177, "mean_token_accuracy": 0.9931002259254456, "num_tokens": 11502606.0, "step": 5395 }, { "entropy": 1.6672588109970092, "epoch": 1.7774851876234363, "grad_norm": 1.195021390914917, "learning_rate": 6.712355612651145e-07, "loss": 0.0175, "mean_token_accuracy": 0.9928203105926514, "num_tokens": 11513121.0, "step": 5400 }, { "entropy": 1.6415475726127624, "epoch": 1.7791310072416064, "grad_norm": 3.7338359355926514, "learning_rate": 6.614677031503059e-07, "loss": 0.0386, "mean_token_accuracy": 0.9931011736392975, "num_tokens": 11523757.0, "step": 5405 }, { "entropy": 1.630179488658905, "epoch": 1.7807768268597761, "grad_norm": 2.090383768081665, "learning_rate": 6.517690067046922e-07, "loss": 0.0193, "mean_token_accuracy": 0.9923229575157165, "num_tokens": 11534852.0, "step": 5410 }, { "entropy": 1.6325619697570801, "epoch": 1.782422646477946, "grad_norm": 0.8092725872993469, "learning_rate": 6.421395437573386e-07, "loss": 0.0219, "mean_token_accuracy": 0.9927444994449616, "num_tokens": 11545588.0, "step": 5415 }, { "entropy": 1.6684589862823487, "epoch": 1.784068466096116, "grad_norm": 1.0808240175247192, "learning_rate": 6.325793856245632e-07, "loss": 0.0084, "mean_token_accuracy": 0.9980384469032287, "num_tokens": 11555985.0, "step": 5420 }, { "entropy": 1.676847243309021, "epoch": 1.7857142857142856, "grad_norm": 1.1335194110870361, "learning_rate": 6.230886031094063e-07, "loss": 0.0168, "mean_token_accuracy": 0.9945612967014312, "num_tokens": 11566484.0, "step": 5425 }, { "entropy": 1.6188385486602783, "epoch": 1.7873601053324557, "grad_norm": 1.116150140762329, "learning_rate": 6.136672665011089e-07, "loss": 0.0198, "mean_token_accuracy": 0.9933637619018555, "num_tokens": 11577115.0, "step": 5430 }, { "entropy": 1.6665136098861695, "epoch": 1.7890059249506254, "grad_norm": 1.42592191696167, "learning_rate": 6.043154455745981e-07, "loss": 0.0205, "mean_token_accuracy": 0.9911065042018891, "num_tokens": 11587678.0, "step": 5435 }, { "entropy": 1.642556118965149, "epoch": 1.7906517445687953, "grad_norm": 1.1183552742004395, "learning_rate": 5.950332095899547e-07, "loss": 0.0191, "mean_token_accuracy": 0.9941525161266327, "num_tokens": 11598261.0, "step": 5440 }, { "entropy": 1.6583900451660156, "epoch": 1.7922975641869652, "grad_norm": 1.2433513402938843, "learning_rate": 5.858206272919165e-07, "loss": 0.0263, "mean_token_accuracy": 0.9929942131042481, "num_tokens": 11609100.0, "step": 5445 }, { "entropy": 1.630812132358551, "epoch": 1.7939433838051349, "grad_norm": 3.484126567840576, "learning_rate": 5.766777669093604e-07, "loss": 0.032, "mean_token_accuracy": 0.9925698578357697, "num_tokens": 11619934.0, "step": 5450 }, { "entropy": 1.595421350002289, "epoch": 1.795589203423305, "grad_norm": 0.8385356664657593, "learning_rate": 5.676046961547987e-07, "loss": 0.0335, "mean_token_accuracy": 0.9886408507823944, "num_tokens": 11630717.0, "step": 5455 }, { "entropy": 1.6572367787361144, "epoch": 1.7972350230414746, "grad_norm": 0.9135512113571167, "learning_rate": 5.586014822238772e-07, "loss": 0.0208, "mean_token_accuracy": 0.9933992445468902, "num_tokens": 11641288.0, "step": 5460 }, { "entropy": 1.5970224499702455, "epoch": 1.7988808426596445, "grad_norm": 1.7872576713562012, "learning_rate": 5.496681917948809e-07, "loss": 0.0154, "mean_token_accuracy": 0.9937366545200348, "num_tokens": 11652159.0, "step": 5465 }, { "entropy": 1.632554793357849, "epoch": 1.8005266622778144, "grad_norm": 0.10970994830131531, "learning_rate": 5.408048910282348e-07, "loss": 0.032, "mean_token_accuracy": 0.9962066173553467, "num_tokens": 11662941.0, "step": 5470 }, { "epoch": 1.8011849901250823, "eval_entropy": 1.6544960326694962, "eval_loss": 0.04707782715559006, "eval_mean_token_accuracy": 0.9874065164100884, "eval_num_tokens": 11667189.0, "eval_runtime": 196.3646, "eval_samples_per_second": 42.406, "eval_steps_per_second": 7.068, "step": 5472 }, { "entropy": 1.6609861731529236, "epoch": 1.8021724818959841, "grad_norm": 1.7392816543579102, "learning_rate": 5.320116455660185e-07, "loss": 0.0206, "mean_token_accuracy": 0.9918794929981232, "num_tokens": 11673648.0, "step": 5475 }, { "entropy": 1.6410995721817017, "epoch": 1.803818301514154, "grad_norm": 1.1604136228561401, "learning_rate": 5.232885205314797e-07, "loss": 0.032, "mean_token_accuracy": 0.9902990460395813, "num_tokens": 11684381.0, "step": 5480 }, { "entropy": 1.6263534784317017, "epoch": 1.805464121132324, "grad_norm": 1.2235734462738037, "learning_rate": 5.146355805285452e-07, "loss": 0.0198, "mean_token_accuracy": 0.994633013010025, "num_tokens": 11694890.0, "step": 5485 }, { "entropy": 1.642207968235016, "epoch": 1.8071099407504936, "grad_norm": 3.1930456161499023, "learning_rate": 5.06052889641353e-07, "loss": 0.0133, "mean_token_accuracy": 0.994309377670288, "num_tokens": 11705533.0, "step": 5490 }, { "entropy": 1.6500659108161926, "epoch": 1.8087557603686637, "grad_norm": 3.067206621170044, "learning_rate": 4.975405114337695e-07, "loss": 0.0303, "mean_token_accuracy": 0.9904295146465302, "num_tokens": 11716171.0, "step": 5495 }, { "entropy": 1.636317217350006, "epoch": 1.8104015799868334, "grad_norm": 0.6358233094215393, "learning_rate": 4.890985089489231e-07, "loss": 0.0135, "mean_token_accuracy": 0.9961594045162201, "num_tokens": 11726724.0, "step": 5500 }, { "entropy": 1.6213058233261108, "epoch": 1.8120473996050033, "grad_norm": 1.792466402053833, "learning_rate": 4.807269447087348e-07, "loss": 0.019, "mean_token_accuracy": 0.9931978344917297, "num_tokens": 11737311.0, "step": 5505 }, { "entropy": 1.6493823170661925, "epoch": 1.8136932192231732, "grad_norm": 1.9440345764160156, "learning_rate": 4.7242588071345965e-07, "loss": 0.0278, "mean_token_accuracy": 0.9932690799236298, "num_tokens": 11747961.0, "step": 5510 }, { "entropy": 1.6495756149291991, "epoch": 1.8153390388413428, "grad_norm": 1.2750672101974487, "learning_rate": 4.6419537844121565e-07, "loss": 0.0189, "mean_token_accuracy": 0.9953626215457916, "num_tokens": 11758583.0, "step": 5515 }, { "entropy": 1.6222103834152222, "epoch": 1.816984858459513, "grad_norm": 2.3976049423217773, "learning_rate": 4.5603549884754463e-07, "loss": 0.0208, "mean_token_accuracy": 0.9919763743877411, "num_tokens": 11769636.0, "step": 5520 }, { "entropy": 1.6041205644607544, "epoch": 1.8186306780776826, "grad_norm": 1.3786771297454834, "learning_rate": 4.479463023649555e-07, "loss": 0.0127, "mean_token_accuracy": 0.9947966277599335, "num_tokens": 11780222.0, "step": 5525 }, { "entropy": 1.657669985294342, "epoch": 1.8202764976958525, "grad_norm": 3.0026445388793945, "learning_rate": 4.3992784890246276e-07, "loss": 0.0108, "mean_token_accuracy": 0.9966068983078002, "num_tokens": 11790566.0, "step": 5530 }, { "entropy": 1.6583294034004212, "epoch": 1.8219223173140224, "grad_norm": 2.0243186950683594, "learning_rate": 4.319801978451654e-07, "loss": 0.0269, "mean_token_accuracy": 0.9926669538021088, "num_tokens": 11801065.0, "step": 5535 }, { "entropy": 1.6717671513557435, "epoch": 1.823568136932192, "grad_norm": 0.7521846294403076, "learning_rate": 4.241034080537909e-07, "loss": 0.0156, "mean_token_accuracy": 0.9947370052337646, "num_tokens": 11811699.0, "step": 5540 }, { "entropy": 1.657283067703247, "epoch": 1.8252139565503622, "grad_norm": 2.917626142501831, "learning_rate": 4.162975378642653e-07, "loss": 0.0252, "mean_token_accuracy": 0.9930805206298828, "num_tokens": 11822213.0, "step": 5545 }, { "entropy": 1.6172433972358704, "epoch": 1.826859776168532, "grad_norm": 0.5599117875099182, "learning_rate": 4.085626450872782e-07, "loss": 0.0151, "mean_token_accuracy": 0.9949965596199035, "num_tokens": 11833106.0, "step": 5550 }, { "entropy": 1.6172014594078064, "epoch": 1.8285055957867018, "grad_norm": 1.6733261346817017, "learning_rate": 4.008987870078629e-07, "loss": 0.0168, "mean_token_accuracy": 0.9940386712551117, "num_tokens": 11843679.0, "step": 5555 }, { "entropy": 1.6307536125183106, "epoch": 1.8301514154048717, "grad_norm": 2.103327512741089, "learning_rate": 3.9330602038495925e-07, "loss": 0.0234, "mean_token_accuracy": 0.9932344555854797, "num_tokens": 11854343.0, "step": 5560 }, { "entropy": 1.6388854265213013, "epoch": 1.8317972350230414, "grad_norm": 0.9723978638648987, "learning_rate": 3.8578440145100373e-07, "loss": 0.0197, "mean_token_accuracy": 0.9940811336040497, "num_tokens": 11864727.0, "step": 5565 }, { "entropy": 1.6083611011505128, "epoch": 1.8334430546412115, "grad_norm": 1.644818902015686, "learning_rate": 3.783339859115065e-07, "loss": 0.0251, "mean_token_accuracy": 0.990217787027359, "num_tokens": 11875655.0, "step": 5570 }, { "entropy": 1.650509774684906, "epoch": 1.8350888742593812, "grad_norm": 4.321114540100098, "learning_rate": 3.709548289446452e-07, "loss": 0.0359, "mean_token_accuracy": 0.9882233738899231, "num_tokens": 11886276.0, "step": 5575 }, { "entropy": 1.6342867493629456, "epoch": 1.836734693877551, "grad_norm": 0.9216455221176147, "learning_rate": 3.636469852008473e-07, "loss": 0.0163, "mean_token_accuracy": 0.9964061677455902, "num_tokens": 11896970.0, "step": 5580 }, { "entropy": 1.6529955506324767, "epoch": 1.838380513495721, "grad_norm": 1.3248862028121948, "learning_rate": 3.564105088023984e-07, "loss": 0.0174, "mean_token_accuracy": 0.9947122275829315, "num_tokens": 11907415.0, "step": 5585 }, { "entropy": 1.6415375351905823, "epoch": 1.8400263331138906, "grad_norm": 1.5896852016448975, "learning_rate": 3.4924545334302675e-07, "loss": 0.0124, "mean_token_accuracy": 0.9948625266551971, "num_tokens": 11918125.0, "step": 5590 }, { "entropy": 1.6392775058746338, "epoch": 1.8416721527320605, "grad_norm": 0.6162107586860657, "learning_rate": 3.421518718875161e-07, "loss": 0.0204, "mean_token_accuracy": 0.993281239271164, "num_tokens": 11928566.0, "step": 5595 }, { "entropy": 1.6502568006515503, "epoch": 1.8433179723502304, "grad_norm": 0.8682472109794617, "learning_rate": 3.351298169713102e-07, "loss": 0.0241, "mean_token_accuracy": 0.992335319519043, "num_tokens": 11939271.0, "step": 5600 }, { "entropy": 1.6440783381462096, "epoch": 1.8449637919684, "grad_norm": 1.3868647813796997, "learning_rate": 3.281793406001232e-07, "loss": 0.0284, "mean_token_accuracy": 0.9914430260658265, "num_tokens": 11949781.0, "step": 5605 }, { "entropy": 1.6473740935325623, "epoch": 1.8466096115865702, "grad_norm": 1.2047520875930786, "learning_rate": 3.213004942495546e-07, "loss": 0.022, "mean_token_accuracy": 0.9923123598098755, "num_tokens": 11960650.0, "step": 5610 }, { "entropy": 1.6165038704872132, "epoch": 1.84825543120474, "grad_norm": 1.3083264827728271, "learning_rate": 3.144933288647067e-07, "loss": 0.01, "mean_token_accuracy": 0.9968856811523438, "num_tokens": 11971426.0, "step": 5615 }, { "entropy": 1.6375835180282592, "epoch": 1.8499012508229098, "grad_norm": 2.6903843879699707, "learning_rate": 3.0775789485981254e-07, "loss": 0.0134, "mean_token_accuracy": 0.9949893295764923, "num_tokens": 11982373.0, "step": 5620 }, { "entropy": 1.6399108648300171, "epoch": 1.8515470704410797, "grad_norm": 0.4139382839202881, "learning_rate": 3.010942421178531e-07, "loss": 0.0415, "mean_token_accuracy": 0.9894947111606598, "num_tokens": 11993082.0, "step": 5625 }, { "entropy": 1.6450653195381164, "epoch": 1.8531928900592494, "grad_norm": 1.1827541589736938, "learning_rate": 2.9450241999020024e-07, "loss": 0.0166, "mean_token_accuracy": 0.9943609952926635, "num_tokens": 12003839.0, "step": 5630 }, { "entropy": 1.6487621068954468, "epoch": 1.8548387096774195, "grad_norm": 0.9702763557434082, "learning_rate": 2.879824772962381e-07, "loss": 0.0148, "mean_token_accuracy": 0.9935874819755555, "num_tokens": 12014594.0, "step": 5635 }, { "entropy": 1.6262930750846862, "epoch": 1.8564845292955892, "grad_norm": 0.6445523500442505, "learning_rate": 2.81534462323011e-07, "loss": 0.0182, "mean_token_accuracy": 0.9919591426849366, "num_tokens": 12025486.0, "step": 5640 }, { "entropy": 1.6225888967514037, "epoch": 1.858130348913759, "grad_norm": 1.6470600366592407, "learning_rate": 2.7515842282486274e-07, "loss": 0.0263, "mean_token_accuracy": 0.9922280848026276, "num_tokens": 12035884.0, "step": 5645 }, { "entropy": 1.637891697883606, "epoch": 1.859776168531929, "grad_norm": 0.8527348637580872, "learning_rate": 2.688544060230835e-07, "loss": 0.0201, "mean_token_accuracy": 0.9935072481632232, "num_tokens": 12046847.0, "step": 5650 }, { "entropy": 1.6528636813163757, "epoch": 1.8614219881500986, "grad_norm": 0.8655990362167358, "learning_rate": 2.626224586055581e-07, "loss": 0.0191, "mean_token_accuracy": 0.9932506561279297, "num_tokens": 12057498.0, "step": 5655 }, { "entropy": 1.6207883477210998, "epoch": 1.8630678077682687, "grad_norm": 0.7990636229515076, "learning_rate": 2.5646262672642033e-07, "loss": 0.0169, "mean_token_accuracy": 0.9947772800922394, "num_tokens": 12068243.0, "step": 5660 }, { "entropy": 1.6112945556640625, "epoch": 1.8647136273864384, "grad_norm": 2.073958396911621, "learning_rate": 2.503749560057178e-07, "loss": 0.0185, "mean_token_accuracy": 0.9919489622116089, "num_tokens": 12079308.0, "step": 5665 }, { "entropy": 1.6480265259742737, "epoch": 1.8663594470046083, "grad_norm": 2.8769478797912598, "learning_rate": 2.4435949152906144e-07, "loss": 0.0235, "mean_token_accuracy": 0.9915726184844971, "num_tokens": 12090018.0, "step": 5670 }, { "entropy": 1.616892433166504, "epoch": 1.8680052666227782, "grad_norm": 0.648573100566864, "learning_rate": 2.3841627784730536e-07, "loss": 0.0174, "mean_token_accuracy": 0.993311470746994, "num_tokens": 12100850.0, "step": 5675 }, { "entropy": 1.6181252598762512, "epoch": 1.869651086240948, "grad_norm": 2.1162548065185547, "learning_rate": 2.325453589762061e-07, "loss": 0.0188, "mean_token_accuracy": 0.9941606819629669, "num_tokens": 12111601.0, "step": 5680 }, { "entropy": 1.6273619651794433, "epoch": 1.8712969058591178, "grad_norm": 3.485987901687622, "learning_rate": 2.2674677839610305e-07, "loss": 0.0351, "mean_token_accuracy": 0.9911395490169526, "num_tokens": 12122330.0, "step": 5685 }, { "entropy": 1.6523537158966064, "epoch": 1.8729427254772877, "grad_norm": 0.9486113786697388, "learning_rate": 2.2102057905159292e-07, "loss": 0.0188, "mean_token_accuracy": 0.9935419499874115, "num_tokens": 12132919.0, "step": 5690 }, { "entropy": 1.6415381669998168, "epoch": 1.8745885450954576, "grad_norm": 1.972302794456482, "learning_rate": 2.1536680335121684e-07, "loss": 0.0198, "mean_token_accuracy": 0.9924857258796692, "num_tokens": 12143394.0, "step": 5695 }, { "entropy": 1.6063251495361328, "epoch": 1.8762343647136275, "grad_norm": 0.8765217661857605, "learning_rate": 2.0978549316713615e-07, "loss": 0.0138, "mean_token_accuracy": 0.9943941414356232, "num_tokens": 12154201.0, "step": 5700 }, { "entropy": 1.6426220178604125, "epoch": 1.8778801843317972, "grad_norm": 1.477146029472351, "learning_rate": 2.0427668983483361e-07, "loss": 0.0221, "mean_token_accuracy": 0.9906249940395355, "num_tokens": 12164635.0, "step": 5705 }, { "entropy": 1.6509037852287292, "epoch": 1.879526003949967, "grad_norm": 1.7725666761398315, "learning_rate": 1.9884043415280274e-07, "loss": 0.0229, "mean_token_accuracy": 0.9909674286842346, "num_tokens": 12175320.0, "step": 5710 }, { "entropy": 1.649256443977356, "epoch": 1.881171823568137, "grad_norm": 1.5584267377853394, "learning_rate": 1.9347676638224122e-07, "loss": 0.0182, "mean_token_accuracy": 0.9936787724494934, "num_tokens": 12185966.0, "step": 5715 }, { "entropy": 1.6484339714050293, "epoch": 1.8828176431863066, "grad_norm": 0.69387286901474, "learning_rate": 1.8818572624676124e-07, "loss": 0.0179, "mean_token_accuracy": 0.9950119018554687, "num_tokens": 12196531.0, "step": 5720 }, { "entropy": 1.618002474308014, "epoch": 1.8844634628044767, "grad_norm": 3.0518853664398193, "learning_rate": 1.8296735293208745e-07, "loss": 0.0324, "mean_token_accuracy": 0.991709417104721, "num_tokens": 12207170.0, "step": 5725 }, { "entropy": 1.6460561990737914, "epoch": 1.8861092824226464, "grad_norm": 0.9351851940155029, "learning_rate": 1.7782168508577168e-07, "loss": 0.0178, "mean_token_accuracy": 0.9925060391426086, "num_tokens": 12217651.0, "step": 5730 }, { "entropy": 1.6722620725631714, "epoch": 1.8877551020408163, "grad_norm": 2.6632797718048096, "learning_rate": 1.7274876081690429e-07, "loss": 0.0423, "mean_token_accuracy": 0.9879862844944001, "num_tokens": 12227987.0, "step": 5735 }, { "entropy": 1.6359737396240235, "epoch": 1.8894009216589862, "grad_norm": 0.5745594501495361, "learning_rate": 1.6774861769583538e-07, "loss": 0.0291, "mean_token_accuracy": 0.9925860285758972, "num_tokens": 12238790.0, "step": 5740 }, { "entropy": 1.6011884927749633, "epoch": 1.8910467412771559, "grad_norm": 1.4916797876358032, "learning_rate": 1.628212927538908e-07, "loss": 0.0285, "mean_token_accuracy": 0.9890454232692718, "num_tokens": 12249683.0, "step": 5745 }, { "entropy": 1.6060463190078735, "epoch": 1.892692560895326, "grad_norm": 0.7883341312408447, "learning_rate": 1.5796682248310214e-07, "loss": 0.0156, "mean_token_accuracy": 0.9944408297538757, "num_tokens": 12260660.0, "step": 5750 }, { "entropy": 1.6521755695343017, "epoch": 1.8943383805134957, "grad_norm": 0.684786856174469, "learning_rate": 1.5318524283593706e-07, "loss": 0.0072, "mean_token_accuracy": 0.9973987996578216, "num_tokens": 12271559.0, "step": 5755 }, { "entropy": 1.6584128499031068, "epoch": 1.8959842001316656, "grad_norm": 3.097158432006836, "learning_rate": 1.484765892250284e-07, "loss": 0.0216, "mean_token_accuracy": 0.9943334817886352, "num_tokens": 12281996.0, "step": 5760 }, { "entropy": 1.6525746941566468, "epoch": 1.8976300197498355, "grad_norm": 4.315025329589844, "learning_rate": 1.4384089652291544e-07, "loss": 0.0293, "mean_token_accuracy": 0.9913449168205262, "num_tokens": 12292609.0, "step": 5765 }, { "entropy": 1.608083975315094, "epoch": 1.8992758393680051, "grad_norm": 0.8511555790901184, "learning_rate": 1.3927819906178864e-07, "loss": 0.0162, "mean_token_accuracy": 0.9940433919429779, "num_tokens": 12303345.0, "step": 5770 }, { "entropy": 1.654729962348938, "epoch": 1.9009216589861753, "grad_norm": 5.2224860191345215, "learning_rate": 1.3478853063322862e-07, "loss": 0.0203, "mean_token_accuracy": 0.9930156350135804, "num_tokens": 12313946.0, "step": 5775 }, { "entropy": 1.6443889737129211, "epoch": 1.902567478604345, "grad_norm": 0.4680110812187195, "learning_rate": 1.3037192448795754e-07, "loss": 0.0185, "mean_token_accuracy": 0.9930983424186707, "num_tokens": 12324639.0, "step": 5780 }, { "entropy": 1.634881365299225, "epoch": 1.9042132982225148, "grad_norm": 1.4533123970031738, "learning_rate": 1.2602841333559934e-07, "loss": 0.0181, "mean_token_accuracy": 0.995710015296936, "num_tokens": 12335427.0, "step": 5785 }, { "entropy": 1.6775338172912597, "epoch": 1.9058591178406847, "grad_norm": 1.9515726566314697, "learning_rate": 1.217580293444276e-07, "loss": 0.0208, "mean_token_accuracy": 0.9887213408946991, "num_tokens": 12345700.0, "step": 5790 }, { "entropy": 1.6608016729354858, "epoch": 1.9075049374588544, "grad_norm": 2.2838118076324463, "learning_rate": 1.1756080414113691e-07, "loss": 0.0221, "mean_token_accuracy": 0.9925664782524108, "num_tokens": 12356355.0, "step": 5795 }, { "entropy": 1.6454607725143433, "epoch": 1.9091507570770243, "grad_norm": 1.0771596431732178, "learning_rate": 1.1343676881059751e-07, "loss": 0.0106, "mean_token_accuracy": 0.9954189419746399, "num_tokens": 12366898.0, "step": 5800 }, { "entropy": 1.6636238932609557, "epoch": 1.9107965766951942, "grad_norm": 3.005011796951294, "learning_rate": 1.0938595389563988e-07, "loss": 0.0143, "mean_token_accuracy": 0.994761872291565, "num_tokens": 12377443.0, "step": 5805 }, { "entropy": 1.6605603098869324, "epoch": 1.912442396313364, "grad_norm": 2.230198621749878, "learning_rate": 1.0540838939681164e-07, "loss": 0.0242, "mean_token_accuracy": 0.9926084816455841, "num_tokens": 12388275.0, "step": 5810 }, { "entropy": 1.606773316860199, "epoch": 1.914088215931534, "grad_norm": 0.7968350052833557, "learning_rate": 1.0150410477216987e-07, "loss": 0.0105, "mean_token_accuracy": 0.9964967429637909, "num_tokens": 12399282.0, "step": 5815 }, { "entropy": 1.6078164100646972, "epoch": 1.9157340355497037, "grad_norm": 0.7457708716392517, "learning_rate": 9.767312893705583e-08, "loss": 0.0203, "mean_token_accuracy": 0.9911931037902832, "num_tokens": 12410043.0, "step": 5820 }, { "entropy": 1.6395422101020813, "epoch": 1.9173798551678736, "grad_norm": 0.5907423496246338, "learning_rate": 9.391549026387948e-08, "loss": 0.0238, "mean_token_accuracy": 0.9929281890392303, "num_tokens": 12420722.0, "step": 5825 }, { "entropy": 1.6461820960044862, "epoch": 1.9190256747860435, "grad_norm": 2.0906271934509277, "learning_rate": 9.023121658191636e-08, "loss": 0.0187, "mean_token_accuracy": 0.993992280960083, "num_tokens": 12431395.0, "step": 5830 }, { "entropy": 1.6568657994270324, "epoch": 1.9206714944042131, "grad_norm": 0.6122414469718933, "learning_rate": 8.662033517709113e-08, "loss": 0.0306, "mean_token_accuracy": 0.9899275124073028, "num_tokens": 12442077.0, "step": 5835 }, { "entropy": 1.6694289207458497, "epoch": 1.9223173140223833, "grad_norm": 0.9769027829170227, "learning_rate": 8.308287279178651e-08, "loss": 0.0214, "mean_token_accuracy": 0.9897110819816589, "num_tokens": 12452490.0, "step": 5840 }, { "entropy": 1.6428742289543152, "epoch": 1.923963133640553, "grad_norm": 1.7451066970825195, "learning_rate": 7.961885562463689e-08, "loss": 0.0205, "mean_token_accuracy": 0.993678605556488, "num_tokens": 12463113.0, "step": 5845 }, { "entropy": 1.6416840195655822, "epoch": 1.9256089532587228, "grad_norm": 0.20614351332187653, "learning_rate": 7.622830933033954e-08, "loss": 0.0159, "mean_token_accuracy": 0.9947439730167389, "num_tokens": 12473584.0, "step": 5850 }, { "entropy": 1.668267822265625, "epoch": 1.9272547728768927, "grad_norm": 1.4911457300186157, "learning_rate": 7.291125901946027e-08, "loss": 0.0293, "mean_token_accuracy": 0.9900563299655915, "num_tokens": 12483990.0, "step": 5855 }, { "entropy": 1.6057178854942322, "epoch": 1.9289005924950624, "grad_norm": 1.5931812524795532, "learning_rate": 6.966772925825149e-08, "loss": 0.0199, "mean_token_accuracy": 0.9930034399032592, "num_tokens": 12494711.0, "step": 5860 }, { "entropy": 1.6411123514175414, "epoch": 1.9305464121132325, "grad_norm": 2.541999578475952, "learning_rate": 6.649774406846777e-08, "loss": 0.0191, "mean_token_accuracy": 0.9947657942771911, "num_tokens": 12505189.0, "step": 5865 }, { "entropy": 1.6578918814659118, "epoch": 1.9321922317314022, "grad_norm": 1.0876408815383911, "learning_rate": 6.340132692718936e-08, "loss": 0.0143, "mean_token_accuracy": 0.994833791255951, "num_tokens": 12515680.0, "step": 5870 }, { "entropy": 1.620399296283722, "epoch": 1.933838051349572, "grad_norm": 1.0122355222702026, "learning_rate": 6.037850076664686e-08, "loss": 0.02, "mean_token_accuracy": 0.9926087379455566, "num_tokens": 12526219.0, "step": 5875 }, { "entropy": 1.619743776321411, "epoch": 1.935483870967742, "grad_norm": 2.4715607166290283, "learning_rate": 5.742928797405234e-08, "loss": 0.027, "mean_token_accuracy": 0.9934347212314606, "num_tokens": 12536968.0, "step": 5880 }, { "entropy": 1.6266910910606385, "epoch": 1.9371296905859117, "grad_norm": 1.2927361726760864, "learning_rate": 5.455371039143176e-08, "loss": 0.0296, "mean_token_accuracy": 0.9922801196575165, "num_tokens": 12547695.0, "step": 5885 }, { "entropy": 1.6311318516731261, "epoch": 1.9387755102040818, "grad_norm": 1.5303946733474731, "learning_rate": 5.175178931546842e-08, "loss": 0.0256, "mean_token_accuracy": 0.9924924194812774, "num_tokens": 12558345.0, "step": 5890 }, { "entropy": 1.6298449873924254, "epoch": 1.9404213298222515, "grad_norm": 0.6128053069114685, "learning_rate": 4.902354549733979e-08, "loss": 0.0268, "mean_token_accuracy": 0.9905460000038147, "num_tokens": 12568986.0, "step": 5895 }, { "entropy": 1.633393156528473, "epoch": 1.9420671494404214, "grad_norm": 1.1418019533157349, "learning_rate": 4.636899914256421e-08, "loss": 0.0184, "mean_token_accuracy": 0.9936664879322052, "num_tokens": 12579759.0, "step": 5900 }, { "entropy": 1.63808730840683, "epoch": 1.9437129690585913, "grad_norm": 2.710805654525757, "learning_rate": 4.378816991085333e-08, "loss": 0.0198, "mean_token_accuracy": 0.9934788465499877, "num_tokens": 12590306.0, "step": 5905 }, { "entropy": 1.6379319310188294, "epoch": 1.945358788676761, "grad_norm": 1.7254809141159058, "learning_rate": 4.128107691596772e-08, "loss": 0.0228, "mean_token_accuracy": 0.9918417274951935, "num_tokens": 12600767.0, "step": 5910 }, { "entropy": 1.6477123498916626, "epoch": 1.9470046082949308, "grad_norm": 0.8686738014221191, "learning_rate": 3.884773872557035e-08, "loss": 0.0285, "mean_token_accuracy": 0.989762258529663, "num_tokens": 12611296.0, "step": 5915 }, { "entropy": 1.6127866506576538, "epoch": 1.9486504279131007, "grad_norm": 1.5231475830078125, "learning_rate": 3.648817336109556e-08, "loss": 0.0185, "mean_token_accuracy": 0.9937713503837585, "num_tokens": 12622049.0, "step": 5920 }, { "entropy": 1.6393255352973939, "epoch": 1.9502962475312706, "grad_norm": 1.2301123142242432, "learning_rate": 3.420239829761029e-08, "loss": 0.0169, "mean_token_accuracy": 0.9921809434890747, "num_tokens": 12632714.0, "step": 5925 }, { "entropy": 1.661654818058014, "epoch": 1.9519420671494405, "grad_norm": 0.7805505990982056, "learning_rate": 3.199043046368644e-08, "loss": 0.0181, "mean_token_accuracy": 0.9945761322975158, "num_tokens": 12643222.0, "step": 5930 }, { "entropy": 1.6112356901168823, "epoch": 1.9535878867676102, "grad_norm": 1.137802004814148, "learning_rate": 2.985228624127534e-08, "loss": 0.0189, "mean_token_accuracy": 0.9933175027370453, "num_tokens": 12654137.0, "step": 5935 }, { "entropy": 1.6136484742164612, "epoch": 1.95523370638578, "grad_norm": 1.2796485424041748, "learning_rate": 2.778798146558903e-08, "loss": 0.0161, "mean_token_accuracy": 0.9944588780403137, "num_tokens": 12664918.0, "step": 5940 }, { "entropy": 1.615454363822937, "epoch": 1.95687952600395, "grad_norm": 1.3428839445114136, "learning_rate": 2.5797531424976983e-08, "loss": 0.0275, "mean_token_accuracy": 0.993310171365738, "num_tokens": 12675751.0, "step": 5945 }, { "entropy": 1.63991322517395, "epoch": 1.9585253456221197, "grad_norm": 1.0178812742233276, "learning_rate": 2.388095086081954e-08, "loss": 0.0189, "mean_token_accuracy": 0.9943699061870575, "num_tokens": 12686462.0, "step": 5950 }, { "entropy": 1.6252501964569093, "epoch": 1.9601711652402898, "grad_norm": 0.5898308157920837, "learning_rate": 2.2038253967415768e-08, "loss": 0.0262, "mean_token_accuracy": 0.9889247179031372, "num_tokens": 12697111.0, "step": 5955 }, { "entropy": 1.6657869219779968, "epoch": 1.9618169848584595, "grad_norm": 1.152711033821106, "learning_rate": 2.0269454391874665e-08, "loss": 0.0195, "mean_token_accuracy": 0.9946586787700653, "num_tokens": 12707827.0, "step": 5960 }, { "entropy": 1.6191688776016235, "epoch": 1.9634628044766294, "grad_norm": 1.1193846464157104, "learning_rate": 1.8574565234023014e-08, "loss": 0.0101, "mean_token_accuracy": 0.9955754101276397, "num_tokens": 12718563.0, "step": 5965 }, { "entropy": 1.603171467781067, "epoch": 1.9651086240947993, "grad_norm": 1.5283174514770508, "learning_rate": 1.6953599046299895e-08, "loss": 0.0175, "mean_token_accuracy": 0.9934708416461945, "num_tokens": 12729286.0, "step": 5970 }, { "entropy": 1.6272645711898803, "epoch": 1.966754443712969, "grad_norm": 0.8465014100074768, "learning_rate": 1.5406567833666785e-08, "loss": 0.0151, "mean_token_accuracy": 0.9951033234596253, "num_tokens": 12739732.0, "step": 5975 }, { "entropy": 1.644085705280304, "epoch": 1.968400263331139, "grad_norm": 0.6121142506599426, "learning_rate": 1.3933483053519825e-08, "loss": 0.0142, "mean_token_accuracy": 0.9949011683464051, "num_tokens": 12750492.0, "step": 5980 }, { "entropy": 1.6154101610183715, "epoch": 1.9700460829493087, "grad_norm": 2.3735482692718506, "learning_rate": 1.2534355615603233e-08, "loss": 0.033, "mean_token_accuracy": 0.9925001919269562, "num_tokens": 12761291.0, "step": 5985 }, { "entropy": 1.6425586223602295, "epoch": 1.9716919025674786, "grad_norm": 2.7295331954956055, "learning_rate": 1.1209195881930479e-08, "loss": 0.0247, "mean_token_accuracy": 0.9938769578933716, "num_tokens": 12771987.0, "step": 5990 }, { "entropy": 1.5966406106948852, "epoch": 1.9733377221856485, "grad_norm": 2.0931482315063477, "learning_rate": 9.958013666704347e-09, "loss": 0.0198, "mean_token_accuracy": 0.9945861279964447, "num_tokens": 12782717.0, "step": 5995 }, { "entropy": 1.6259140491485595, "epoch": 1.9749835418038182, "grad_norm": 0.6469723582267761, "learning_rate": 8.780818236248101e-09, "loss": 0.0242, "mean_token_accuracy": 0.9942210376262665, "num_tokens": 12793255.0, "step": 6000 }, { "entropy": 1.6549716949462892, "epoch": 1.9766293614219883, "grad_norm": 0.901390790939331, "learning_rate": 7.67761830893443e-09, "loss": 0.0137, "mean_token_accuracy": 0.9956705689430236, "num_tokens": 12804142.0, "step": 6005 }, { "entropy": 1.6143463373184204, "epoch": 1.978275181040158, "grad_norm": 1.2343865633010864, "learning_rate": 6.648422055118842e-09, "loss": 0.0127, "mean_token_accuracy": 0.9951093852519989, "num_tokens": 12815008.0, "step": 6010 }, { "entropy": 1.6510612845420838, "epoch": 1.9799210006583279, "grad_norm": 2.802551031112671, "learning_rate": 5.693237097085247e-09, "loss": 0.0162, "mean_token_accuracy": 0.9935081660747528, "num_tokens": 12825498.0, "step": 6015 }, { "entropy": 1.6498525738716125, "epoch": 1.9815668202764978, "grad_norm": 0.6338088512420654, "learning_rate": 4.8120705089849116e-09, "loss": 0.0147, "mean_token_accuracy": 0.9955414116382599, "num_tokens": 12835988.0, "step": 6020 }, { "entropy": 1.6122831344604491, "epoch": 1.9832126398946675, "grad_norm": 2.181898355484009, "learning_rate": 4.00492881678427e-09, "loss": 0.0294, "mean_token_accuracy": 0.9928855717182159, "num_tokens": 12846863.0, "step": 6025 }, { "entropy": 1.638089156150818, "epoch": 1.9848584595128373, "grad_norm": 2.5133092403411865, "learning_rate": 3.271817998216076e-09, "loss": 0.025, "mean_token_accuracy": 0.9939551532268525, "num_tokens": 12857465.0, "step": 6030 }, { "entropy": 1.5842849373817445, "epoch": 1.9865042791310072, "grad_norm": 3.628753662109375, "learning_rate": 2.612743482741653e-09, "loss": 0.0359, "mean_token_accuracy": 0.9892144203186035, "num_tokens": 12868438.0, "step": 6035 }, { "entropy": 1.676398503780365, "epoch": 1.9881500987491771, "grad_norm": 2.431900978088379, "learning_rate": 2.0277101514987184e-09, "loss": 0.0276, "mean_token_accuracy": 0.991285365819931, "num_tokens": 12878820.0, "step": 6040 }, { "entropy": 1.6270844221115113, "epoch": 1.989795918367347, "grad_norm": 3.490521192550659, "learning_rate": 1.5167223372780648e-09, "loss": 0.0291, "mean_token_accuracy": 0.9905443787574768, "num_tokens": 12889541.0, "step": 6045 }, { "entropy": 1.595809280872345, "epoch": 1.9914417379855167, "grad_norm": 2.535038709640503, "learning_rate": 1.0797838244802627e-09, "loss": 0.022, "mean_token_accuracy": 0.9905419945716858, "num_tokens": 12900791.0, "step": 6050 }, { "entropy": 1.594471561908722, "epoch": 1.9930875576036866, "grad_norm": 1.6428083181381226, "learning_rate": 7.168978490978973e-10, "loss": 0.0099, "mean_token_accuracy": 0.9953214168548584, "num_tokens": 12911687.0, "step": 6055 }, { "entropy": 1.6622145771980286, "epoch": 1.9947333772218565, "grad_norm": 1.0861161947250366, "learning_rate": 4.2806709868115084e-10, "loss": 0.0256, "mean_token_accuracy": 0.9900858104228973, "num_tokens": 12922176.0, "step": 6060 }, { "entropy": 1.616791796684265, "epoch": 1.9963791968400262, "grad_norm": 2.0424978733062744, "learning_rate": 2.1329371232892138e-10, "loss": 0.0261, "mean_token_accuracy": 0.9941427707672119, "num_tokens": 12932662.0, "step": 6065 }, { "entropy": 1.6235838294029237, "epoch": 1.9980250164581963, "grad_norm": 0.6039044260978699, "learning_rate": 7.257928066217723e-11, "loss": 0.0321, "mean_token_accuracy": 0.9933849751949311, "num_tokens": 12943359.0, "step": 6070 }, { "entropy": 1.6359187722206117, "epoch": 1.999670836076366, "grad_norm": 3.8299636840820312, "learning_rate": 5.924845819516023e-12, "loss": 0.0217, "mean_token_accuracy": 0.9923797845840454, "num_tokens": 12954081.0, "step": 6075 } ], "logging_steps": 5, "max_steps": 6076, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 608, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.285029236308019e+16, "train_batch_size": 6, "trial_name": null, "trial_params": null }