{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.719208870242733, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.3359488248825073, "epoch": 0.0079912096693637, "grad_norm": 13.5, "learning_rate": 0.00018, "loss": 1.1391, "mean_token_accuracy": 0.7321531124413013, "num_tokens": 17863.0, "step": 10 }, { "entropy": 0.8153514951467514, "epoch": 0.0159824193387274, "grad_norm": 0.357421875, "learning_rate": 0.00019855072463768116, "loss": 0.735, "mean_token_accuracy": 0.806717099994421, "num_tokens": 35538.0, "step": 20 }, { "entropy": 0.7226301684975625, "epoch": 0.0239736290080911, "grad_norm": 0.27734375, "learning_rate": 0.00019694041867954914, "loss": 0.6906, "mean_token_accuracy": 0.8121421404182911, "num_tokens": 52898.0, "step": 30 }, { "entropy": 0.7096233293414116, "epoch": 0.0319648386774548, "grad_norm": 0.408203125, "learning_rate": 0.00019533011272141707, "loss": 0.6794, "mean_token_accuracy": 0.8093454599380493, "num_tokens": 71120.0, "step": 40 }, { "entropy": 0.6432753155007959, "epoch": 0.0399560483468185, "grad_norm": 0.29296875, "learning_rate": 0.00019371980676328502, "loss": 0.5778, "mean_token_accuracy": 0.8227488771080971, "num_tokens": 89183.0, "step": 50 }, { "entropy": 0.6528786711394787, "epoch": 0.0479472580161822, "grad_norm": 0.19921875, "learning_rate": 0.000192109500805153, "loss": 0.6056, "mean_token_accuracy": 0.8218209922313691, "num_tokens": 108013.0, "step": 60 }, { "entropy": 0.6535129006952047, "epoch": 0.055938467685545896, "grad_norm": 0.2236328125, "learning_rate": 0.00019049919484702096, "loss": 0.5954, "mean_token_accuracy": 0.8255642831325531, "num_tokens": 126649.0, "step": 70 }, { "entropy": 0.6079864472150802, "epoch": 0.0639296773549096, "grad_norm": 0.2001953125, "learning_rate": 0.00018888888888888888, "loss": 0.5911, "mean_token_accuracy": 0.8231404431164264, "num_tokens": 144182.0, "step": 80 }, { "entropy": 0.6695162910968065, "epoch": 0.0719208870242733, "grad_norm": 0.1962890625, "learning_rate": 0.00018727858293075687, "loss": 0.6312, "mean_token_accuracy": 0.8185268670320511, "num_tokens": 163634.0, "step": 90 }, { "entropy": 0.6382982328534126, "epoch": 0.079912096693637, "grad_norm": 0.2109375, "learning_rate": 0.00018566827697262482, "loss": 0.5965, "mean_token_accuracy": 0.8219615206122398, "num_tokens": 182899.0, "step": 100 }, { "entropy": 0.615644377656281, "epoch": 0.0879033063630007, "grad_norm": 0.17578125, "learning_rate": 0.00018405797101449275, "loss": 0.555, "mean_token_accuracy": 0.8339135125279427, "num_tokens": 201540.0, "step": 110 }, { "entropy": 0.6227292243391276, "epoch": 0.0958945160323644, "grad_norm": 0.2392578125, "learning_rate": 0.00018244766505636073, "loss": 0.5958, "mean_token_accuracy": 0.822028624266386, "num_tokens": 220163.0, "step": 120 }, { "entropy": 0.6280987083911895, "epoch": 0.1038857257017281, "grad_norm": 0.2177734375, "learning_rate": 0.00018083735909822868, "loss": 0.5827, "mean_token_accuracy": 0.8300345286726951, "num_tokens": 237989.0, "step": 130 }, { "entropy": 0.6323467470705509, "epoch": 0.11187693537109179, "grad_norm": 0.154296875, "learning_rate": 0.00017922705314009664, "loss": 0.5986, "mean_token_accuracy": 0.8197977431118488, "num_tokens": 256250.0, "step": 140 }, { "entropy": 0.6162901744246483, "epoch": 0.1198681450404555, "grad_norm": 0.1552734375, "learning_rate": 0.00017761674718196456, "loss": 0.5769, "mean_token_accuracy": 0.8255695670843124, "num_tokens": 274386.0, "step": 150 }, { "entropy": 0.6158512964844703, "epoch": 0.1278593547098192, "grad_norm": 0.203125, "learning_rate": 0.00017600644122383254, "loss": 0.5713, "mean_token_accuracy": 0.8270042009651661, "num_tokens": 292835.0, "step": 160 }, { "entropy": 0.6406407546252012, "epoch": 0.1358505643791829, "grad_norm": 0.2119140625, "learning_rate": 0.0001743961352657005, "loss": 0.6104, "mean_token_accuracy": 0.8200316116213798, "num_tokens": 310392.0, "step": 170 }, { "entropy": 0.6069310043007136, "epoch": 0.1438417740485466, "grad_norm": 0.1640625, "learning_rate": 0.00017278582930756842, "loss": 0.5624, "mean_token_accuracy": 0.8317995510995388, "num_tokens": 331305.0, "step": 180 }, { "entropy": 0.6235411781817675, "epoch": 0.1518329837179103, "grad_norm": 0.1640625, "learning_rate": 0.0001711755233494364, "loss": 0.5852, "mean_token_accuracy": 0.8278815999627114, "num_tokens": 349604.0, "step": 190 }, { "entropy": 0.6172796195372939, "epoch": 0.159824193387274, "grad_norm": 0.1669921875, "learning_rate": 0.00016956521739130436, "loss": 0.5764, "mean_token_accuracy": 0.8313593462109565, "num_tokens": 367876.0, "step": 200 }, { "entropy": 0.6155283484607935, "epoch": 0.1678154030566377, "grad_norm": 0.1865234375, "learning_rate": 0.00016795491143317231, "loss": 0.5773, "mean_token_accuracy": 0.8269082359969616, "num_tokens": 385573.0, "step": 210 }, { "entropy": 0.6083606427535415, "epoch": 0.1758066127260014, "grad_norm": 0.154296875, "learning_rate": 0.00016634460547504027, "loss": 0.5704, "mean_token_accuracy": 0.8263973362743855, "num_tokens": 404851.0, "step": 220 }, { "entropy": 0.6131238225847483, "epoch": 0.1837978223953651, "grad_norm": 0.20703125, "learning_rate": 0.00016473429951690822, "loss": 0.5817, "mean_token_accuracy": 0.8245558224618434, "num_tokens": 422809.0, "step": 230 }, { "entropy": 0.6223553754389286, "epoch": 0.1917890320647288, "grad_norm": 0.234375, "learning_rate": 0.00016312399355877618, "loss": 0.5871, "mean_token_accuracy": 0.8232570059597493, "num_tokens": 439086.0, "step": 240 }, { "entropy": 0.6230555597692728, "epoch": 0.1997802417340925, "grad_norm": 0.171875, "learning_rate": 0.00016151368760064413, "loss": 0.5751, "mean_token_accuracy": 0.8289580881595612, "num_tokens": 457157.0, "step": 250 }, { "entropy": 0.5794363841414452, "epoch": 0.2077714514034562, "grad_norm": 0.2294921875, "learning_rate": 0.00015990338164251208, "loss": 0.5627, "mean_token_accuracy": 0.8365659207105637, "num_tokens": 474701.0, "step": 260 }, { "entropy": 0.5860258772969246, "epoch": 0.2157626610728199, "grad_norm": 0.1484375, "learning_rate": 0.00015829307568438004, "loss": 0.5363, "mean_token_accuracy": 0.8401569269597531, "num_tokens": 495405.0, "step": 270 }, { "entropy": 0.581408916413784, "epoch": 0.22375387074218359, "grad_norm": 0.205078125, "learning_rate": 0.000156682769726248, "loss": 0.5593, "mean_token_accuracy": 0.8319723285734654, "num_tokens": 512629.0, "step": 280 }, { "entropy": 0.5774631313979626, "epoch": 0.2317450804115473, "grad_norm": 0.171875, "learning_rate": 0.00015507246376811595, "loss": 0.5445, "mean_token_accuracy": 0.8400227598845958, "num_tokens": 531652.0, "step": 290 }, { "entropy": 0.5888700131326914, "epoch": 0.239736290080911, "grad_norm": 0.1884765625, "learning_rate": 0.0001534621578099839, "loss": 0.5475, "mean_token_accuracy": 0.8398719631135464, "num_tokens": 551807.0, "step": 300 }, { "entropy": 0.6303706657141447, "epoch": 0.2477274997502747, "grad_norm": 0.185546875, "learning_rate": 0.00015185185185185185, "loss": 0.5994, "mean_token_accuracy": 0.8221785329282284, "num_tokens": 570173.0, "step": 310 }, { "entropy": 0.591961058229208, "epoch": 0.2557187094196384, "grad_norm": 0.1708984375, "learning_rate": 0.0001502415458937198, "loss": 0.5588, "mean_token_accuracy": 0.8339079335331917, "num_tokens": 587517.0, "step": 320 }, { "entropy": 0.6203833676874637, "epoch": 0.2637099190890021, "grad_norm": 0.158203125, "learning_rate": 0.00014863123993558776, "loss": 0.5993, "mean_token_accuracy": 0.8254540674388409, "num_tokens": 605533.0, "step": 330 }, { "entropy": 0.5948904637247324, "epoch": 0.2717011287583658, "grad_norm": 0.1689453125, "learning_rate": 0.00014702093397745574, "loss": 0.5386, "mean_token_accuracy": 0.835470549017191, "num_tokens": 623145.0, "step": 340 }, { "entropy": 0.5892129261046648, "epoch": 0.2796923384277295, "grad_norm": 0.2041015625, "learning_rate": 0.00014541062801932367, "loss": 0.5445, "mean_token_accuracy": 0.8327487081289291, "num_tokens": 642429.0, "step": 350 }, { "entropy": 0.58230458535254, "epoch": 0.2876835480970932, "grad_norm": 0.1748046875, "learning_rate": 0.00014380032206119162, "loss": 0.5458, "mean_token_accuracy": 0.8369288526475429, "num_tokens": 660595.0, "step": 360 }, { "entropy": 0.5953514769673347, "epoch": 0.2956747577664569, "grad_norm": 0.1494140625, "learning_rate": 0.0001421900161030596, "loss": 0.5564, "mean_token_accuracy": 0.8314083501696586, "num_tokens": 680301.0, "step": 370 }, { "entropy": 0.6272314839065075, "epoch": 0.3036659674358206, "grad_norm": 0.189453125, "learning_rate": 0.00014057971014492753, "loss": 0.5879, "mean_token_accuracy": 0.8269149273633957, "num_tokens": 698836.0, "step": 380 }, { "entropy": 0.5974850662052631, "epoch": 0.3116571771051843, "grad_norm": 0.1875, "learning_rate": 0.0001389694041867955, "loss": 0.5567, "mean_token_accuracy": 0.8330555327236653, "num_tokens": 717301.0, "step": 390 }, { "entropy": 0.610439121723175, "epoch": 0.319648386774548, "grad_norm": 0.1943359375, "learning_rate": 0.00013735909822866347, "loss": 0.5798, "mean_token_accuracy": 0.8273908801376819, "num_tokens": 735623.0, "step": 400 }, { "entropy": 0.6218720726668835, "epoch": 0.3276395964439117, "grad_norm": 0.1689453125, "learning_rate": 0.00013574879227053142, "loss": 0.5681, "mean_token_accuracy": 0.8264160886406898, "num_tokens": 754095.0, "step": 410 }, { "entropy": 0.5961160399019718, "epoch": 0.3356308061132754, "grad_norm": 0.130859375, "learning_rate": 0.00013413848631239935, "loss": 0.5649, "mean_token_accuracy": 0.8278753645718098, "num_tokens": 772932.0, "step": 420 }, { "entropy": 0.5970222994685173, "epoch": 0.3436220157826391, "grad_norm": 0.1552734375, "learning_rate": 0.0001325281803542673, "loss": 0.5717, "mean_token_accuracy": 0.8289826177060604, "num_tokens": 791954.0, "step": 430 }, { "entropy": 0.5869237255305052, "epoch": 0.3516132254520028, "grad_norm": 0.23828125, "learning_rate": 0.00013091787439613528, "loss": 0.5424, "mean_token_accuracy": 0.8353226915001869, "num_tokens": 810372.0, "step": 440 }, { "entropy": 0.6047268303111195, "epoch": 0.3596044351213665, "grad_norm": 0.16015625, "learning_rate": 0.0001293075684380032, "loss": 0.5816, "mean_token_accuracy": 0.8292522899806499, "num_tokens": 828031.0, "step": 450 }, { "entropy": 0.6398113902658225, "epoch": 0.3675956447907302, "grad_norm": 0.193359375, "learning_rate": 0.00012769726247987117, "loss": 0.587, "mean_token_accuracy": 0.8254243724048138, "num_tokens": 844965.0, "step": 460 }, { "entropy": 0.5699722157791257, "epoch": 0.3755868544600939, "grad_norm": 0.150390625, "learning_rate": 0.00012608695652173915, "loss": 0.5302, "mean_token_accuracy": 0.8379433415830135, "num_tokens": 864068.0, "step": 470 }, { "entropy": 0.6004057168960572, "epoch": 0.3835780641294576, "grad_norm": 0.1689453125, "learning_rate": 0.0001244766505636071, "loss": 0.5735, "mean_token_accuracy": 0.8319472163915634, "num_tokens": 882516.0, "step": 480 }, { "entropy": 0.612379564717412, "epoch": 0.3915692737988213, "grad_norm": 0.17578125, "learning_rate": 0.00012286634460547503, "loss": 0.5605, "mean_token_accuracy": 0.8312513306736946, "num_tokens": 901332.0, "step": 490 }, { "entropy": 0.5999802689999342, "epoch": 0.399560483468185, "grad_norm": 0.2236328125, "learning_rate": 0.00012125603864734301, "loss": 0.5844, "mean_token_accuracy": 0.8295043386518955, "num_tokens": 918902.0, "step": 500 }, { "entropy": 0.6438330963253975, "epoch": 0.4075516931375487, "grad_norm": 0.181640625, "learning_rate": 0.00011964573268921095, "loss": 0.6039, "mean_token_accuracy": 0.8217731453478336, "num_tokens": 937381.0, "step": 510 }, { "entropy": 0.557589478418231, "epoch": 0.4155429028069124, "grad_norm": 0.1748046875, "learning_rate": 0.0001180354267310789, "loss": 0.5347, "mean_token_accuracy": 0.8419624336063862, "num_tokens": 956408.0, "step": 520 }, { "entropy": 0.5831106752157211, "epoch": 0.4235341124762761, "grad_norm": 0.15625, "learning_rate": 0.00011642512077294687, "loss": 0.5566, "mean_token_accuracy": 0.8344054028391839, "num_tokens": 974727.0, "step": 530 }, { "entropy": 0.6096597962081433, "epoch": 0.4315253221456398, "grad_norm": 0.16015625, "learning_rate": 0.00011481481481481482, "loss": 0.5906, "mean_token_accuracy": 0.8249844819307327, "num_tokens": 992039.0, "step": 540 }, { "entropy": 0.6296380385756493, "epoch": 0.4395165318150035, "grad_norm": 0.185546875, "learning_rate": 0.00011320450885668277, "loss": 0.5774, "mean_token_accuracy": 0.8290597923099995, "num_tokens": 1010746.0, "step": 550 }, { "entropy": 0.5989726323634386, "epoch": 0.44750774148436717, "grad_norm": 0.1552734375, "learning_rate": 0.00011159420289855073, "loss": 0.5668, "mean_token_accuracy": 0.8317835494875908, "num_tokens": 1029302.0, "step": 560 }, { "entropy": 0.5985535632818937, "epoch": 0.4554989511537309, "grad_norm": 0.1533203125, "learning_rate": 0.00010998389694041869, "loss": 0.5927, "mean_token_accuracy": 0.8251331336796284, "num_tokens": 1047787.0, "step": 570 }, { "entropy": 0.5919383157044649, "epoch": 0.4634901608230946, "grad_norm": 0.140625, "learning_rate": 0.00010837359098228663, "loss": 0.5584, "mean_token_accuracy": 0.8338681124150753, "num_tokens": 1067471.0, "step": 580 }, { "entropy": 0.5655450899153948, "epoch": 0.4714813704924583, "grad_norm": 0.146484375, "learning_rate": 0.00010676328502415461, "loss": 0.5343, "mean_token_accuracy": 0.8383485890924931, "num_tokens": 1086046.0, "step": 590 }, { "entropy": 0.616737426072359, "epoch": 0.479472580161822, "grad_norm": 0.173828125, "learning_rate": 0.00010515297906602255, "loss": 0.5908, "mean_token_accuracy": 0.8193590499460697, "num_tokens": 1103227.0, "step": 600 }, { "entropy": 0.5877503883093596, "epoch": 0.4874637898311857, "grad_norm": 0.16796875, "learning_rate": 0.0001035426731078905, "loss": 0.5505, "mean_token_accuracy": 0.8355500593781471, "num_tokens": 1121994.0, "step": 610 }, { "entropy": 0.5908785469830036, "epoch": 0.4954549995005494, "grad_norm": 0.2255859375, "learning_rate": 0.00010193236714975847, "loss": 0.5794, "mean_token_accuracy": 0.8315194040536881, "num_tokens": 1139965.0, "step": 620 }, { "entropy": 0.6211531057953834, "epoch": 0.5034462091699131, "grad_norm": 0.134765625, "learning_rate": 0.00010032206119162641, "loss": 0.5664, "mean_token_accuracy": 0.8258850328624249, "num_tokens": 1159075.0, "step": 630 }, { "entropy": 0.5950863931328059, "epoch": 0.5114374188392768, "grad_norm": 0.1630859375, "learning_rate": 9.871175523349438e-05, "loss": 0.5497, "mean_token_accuracy": 0.8346662126481533, "num_tokens": 1176494.0, "step": 640 }, { "entropy": 0.5760080838575959, "epoch": 0.5194286285086405, "grad_norm": 0.23828125, "learning_rate": 9.710144927536232e-05, "loss": 0.5632, "mean_token_accuracy": 0.8364547491073608, "num_tokens": 1195371.0, "step": 650 }, { "entropy": 0.5897768154740334, "epoch": 0.5274198381780042, "grad_norm": 0.150390625, "learning_rate": 9.549114331723029e-05, "loss": 0.5611, "mean_token_accuracy": 0.8353584706783295, "num_tokens": 1215474.0, "step": 660 }, { "entropy": 0.6259935267269612, "epoch": 0.5354110478473679, "grad_norm": 0.19921875, "learning_rate": 9.388083735909823e-05, "loss": 0.5834, "mean_token_accuracy": 0.8251566261053085, "num_tokens": 1233066.0, "step": 670 }, { "entropy": 0.5858545243740082, "epoch": 0.5434022575167315, "grad_norm": 0.2080078125, "learning_rate": 9.227053140096618e-05, "loss": 0.5709, "mean_token_accuracy": 0.8273489251732826, "num_tokens": 1249355.0, "step": 680 }, { "entropy": 0.6020776845514775, "epoch": 0.5513934671860953, "grad_norm": 0.171875, "learning_rate": 9.066022544283415e-05, "loss": 0.5657, "mean_token_accuracy": 0.8260138787329196, "num_tokens": 1267301.0, "step": 690 }, { "entropy": 0.596510236337781, "epoch": 0.559384676855459, "grad_norm": 0.154296875, "learning_rate": 8.904991948470209e-05, "loss": 0.5557, "mean_token_accuracy": 0.834468311816454, "num_tokens": 1285467.0, "step": 700 }, { "entropy": 0.5882966015487909, "epoch": 0.5673758865248227, "grad_norm": 0.166015625, "learning_rate": 8.743961352657006e-05, "loss": 0.5423, "mean_token_accuracy": 0.8352835536003113, "num_tokens": 1304357.0, "step": 710 }, { "entropy": 0.6191790480166673, "epoch": 0.5753670961941864, "grad_norm": 0.1533203125, "learning_rate": 8.582930756843801e-05, "loss": 0.5759, "mean_token_accuracy": 0.8274984866380691, "num_tokens": 1321602.0, "step": 720 }, { "entropy": 0.6024752855300903, "epoch": 0.5833583058635501, "grad_norm": 0.2001953125, "learning_rate": 8.421900161030597e-05, "loss": 0.5638, "mean_token_accuracy": 0.8288030169904232, "num_tokens": 1340012.0, "step": 730 }, { "entropy": 0.5633068412542344, "epoch": 0.5913495155329138, "grad_norm": 0.1796875, "learning_rate": 8.260869565217392e-05, "loss": 0.5262, "mean_token_accuracy": 0.8409125037491322, "num_tokens": 1358549.0, "step": 740 }, { "entropy": 0.6121923718601465, "epoch": 0.5993407252022775, "grad_norm": 0.1640625, "learning_rate": 8.099838969404187e-05, "loss": 0.5782, "mean_token_accuracy": 0.8247248627245426, "num_tokens": 1376295.0, "step": 750 }, { "entropy": 0.5850671246647835, "epoch": 0.6073319348716412, "grad_norm": 0.12255859375, "learning_rate": 7.938808373590983e-05, "loss": 0.5481, "mean_token_accuracy": 0.8378213487565518, "num_tokens": 1396825.0, "step": 760 }, { "entropy": 0.6004991352558136, "epoch": 0.6153231445410049, "grad_norm": 0.1484375, "learning_rate": 7.777777777777778e-05, "loss": 0.5697, "mean_token_accuracy": 0.8314851686358452, "num_tokens": 1415779.0, "step": 770 }, { "entropy": 0.615888693742454, "epoch": 0.6233143542103686, "grad_norm": 0.1494140625, "learning_rate": 7.616747181964574e-05, "loss": 0.586, "mean_token_accuracy": 0.8290720954537392, "num_tokens": 1433844.0, "step": 780 }, { "entropy": 0.631605738401413, "epoch": 0.6313055638797322, "grad_norm": 0.1787109375, "learning_rate": 7.455716586151369e-05, "loss": 0.5896, "mean_token_accuracy": 0.823421498388052, "num_tokens": 1452173.0, "step": 790 }, { "entropy": 0.5806491080671549, "epoch": 0.639296773549096, "grad_norm": 0.1767578125, "learning_rate": 7.294685990338164e-05, "loss": 0.5541, "mean_token_accuracy": 0.8376387834548951, "num_tokens": 1469089.0, "step": 800 }, { "entropy": 0.5787392556667328, "epoch": 0.6472879832184597, "grad_norm": 0.2734375, "learning_rate": 7.13365539452496e-05, "loss": 0.5344, "mean_token_accuracy": 0.832699004560709, "num_tokens": 1488541.0, "step": 810 }, { "entropy": 0.5935858219861985, "epoch": 0.6552791928878234, "grad_norm": 0.1435546875, "learning_rate": 6.972624798711755e-05, "loss": 0.549, "mean_token_accuracy": 0.8358408592641353, "num_tokens": 1506966.0, "step": 820 }, { "entropy": 0.6146302495151759, "epoch": 0.663270402557187, "grad_norm": 0.15234375, "learning_rate": 6.811594202898552e-05, "loss": 0.5794, "mean_token_accuracy": 0.8260477609932423, "num_tokens": 1526428.0, "step": 830 }, { "entropy": 0.6146373618394136, "epoch": 0.6712616122265508, "grad_norm": 0.169921875, "learning_rate": 6.650563607085346e-05, "loss": 0.5917, "mean_token_accuracy": 0.8253330059349537, "num_tokens": 1543859.0, "step": 840 }, { "entropy": 0.5822377149015665, "epoch": 0.6792528218959145, "grad_norm": 0.1962890625, "learning_rate": 6.489533011272141e-05, "loss": 0.5561, "mean_token_accuracy": 0.8367624327540397, "num_tokens": 1562062.0, "step": 850 }, { "entropy": 0.5773209661245347, "epoch": 0.6872440315652782, "grad_norm": 0.1650390625, "learning_rate": 6.328502415458938e-05, "loss": 0.5144, "mean_token_accuracy": 0.83621421828866, "num_tokens": 1580374.0, "step": 860 }, { "entropy": 0.5891423657536506, "epoch": 0.6952352412346419, "grad_norm": 0.18359375, "learning_rate": 6.167471819645732e-05, "loss": 0.5766, "mean_token_accuracy": 0.8288635179400444, "num_tokens": 1598383.0, "step": 870 }, { "entropy": 0.5979194710031152, "epoch": 0.7032264509040056, "grad_norm": 0.15234375, "learning_rate": 6.006441223832528e-05, "loss": 0.5452, "mean_token_accuracy": 0.8346437945961952, "num_tokens": 1617322.0, "step": 880 }, { "entropy": 0.6381862349808216, "epoch": 0.7112176605733693, "grad_norm": 0.173828125, "learning_rate": 5.8454106280193244e-05, "loss": 0.6008, "mean_token_accuracy": 0.8242271035909653, "num_tokens": 1633941.0, "step": 890 }, { "entropy": 0.5802713014185429, "epoch": 0.719208870242733, "grad_norm": 0.15625, "learning_rate": 5.684380032206119e-05, "loss": 0.5462, "mean_token_accuracy": 0.8342008836567402, "num_tokens": 1651765.0, "step": 900 } ], "logging_steps": 10, "max_steps": 1252, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.51066844059566e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }