| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 13.424242424242424, | |
| "eval_steps": 100, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 0.9522146363556385, | |
| "epoch": 0.16835016835016836, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.9937530517578125, | |
| "mean_token_accuracy": 0.7483265110850335, | |
| "num_tokens": 1591793.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.3762529502809048, | |
| "epoch": 0.3367003367003367, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.6333333333333335e-05, | |
| "loss": 0.3625458145141602, | |
| "mean_token_accuracy": 0.8780256441235542, | |
| "num_tokens": 3174240.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.3107199192792177, | |
| "epoch": 0.5050505050505051, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 1.9997430173759876e-05, | |
| "loss": 0.30434131622314453, | |
| "mean_token_accuracy": 0.8953498190641404, | |
| "num_tokens": 4755519.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.2847785219550133, | |
| "epoch": 0.6734006734006734, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 1.9980063397715685e-05, | |
| "loss": 0.28054773330688476, | |
| "mean_token_accuracy": 0.9025071159005165, | |
| "num_tokens": 6334024.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6734006734006734, | |
| "eval_entropy": 0.2793949246406555, | |
| "eval_loss": 0.2698107361793518, | |
| "eval_mean_token_accuracy": 0.9054524502754211, | |
| "eval_num_tokens": 6334024.0, | |
| "eval_runtime": 14.0488, | |
| "eval_samples_per_second": 17.795, | |
| "eval_steps_per_second": 8.898, | |
| "eval_token_accuracy": 0.9051616133787281, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.26899013079702855, | |
| "epoch": 0.8417508417508418, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 1.9946341631587086e-05, | |
| "loss": 0.26340150833129883, | |
| "mean_token_accuracy": 0.9085881695151329, | |
| "num_tokens": 7921274.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.24236563958075583, | |
| "epoch": 1.0067340067340067, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.9896320137462984e-05, | |
| "loss": 0.23591676712036133, | |
| "mean_token_accuracy": 0.9202974551186269, | |
| "num_tokens": 9458752.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.19062039852142335, | |
| "epoch": 1.1750841750841752, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 1.983008088887478e-05, | |
| "loss": 0.18400527954101562, | |
| "mean_token_accuracy": 0.9372689473628998, | |
| "num_tokens": 11049146.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.17757252234965562, | |
| "epoch": 1.3434343434343434, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 1.9747732436460955e-05, | |
| "loss": 0.17247369766235351, | |
| "mean_token_accuracy": 0.9412289550900459, | |
| "num_tokens": 12629069.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.3434343434343434, | |
| "eval_entropy": 0.17348989713191987, | |
| "eval_loss": 0.1803748607635498, | |
| "eval_mean_token_accuracy": 0.9387603945732117, | |
| "eval_num_tokens": 12629069.0, | |
| "eval_runtime": 13.9222, | |
| "eval_samples_per_second": 17.957, | |
| "eval_steps_per_second": 8.978, | |
| "eval_token_accuracy": 0.9386888733819055, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.17212937109172344, | |
| "epoch": 1.5117845117845117, | |
| "grad_norm": 0.16015625, | |
| "learning_rate": 1.9649409730077934e-05, | |
| "loss": 0.1680605125427246, | |
| "mean_token_accuracy": 0.9425190502405166, | |
| "num_tokens": 14207351.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.16998517245054245, | |
| "epoch": 1.6801346801346801, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 1.9535273897648857e-05, | |
| "loss": 0.1662980079650879, | |
| "mean_token_accuracy": 0.943454926609993, | |
| "num_tokens": 15787859.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.16503205563873052, | |
| "epoch": 1.8484848484848486, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 1.9405511981112553e-05, | |
| "loss": 0.16160566329956055, | |
| "mean_token_accuracy": 0.9454404374957085, | |
| "num_tokens": 17374190.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.1615225655615938, | |
| "epoch": 2.0134680134680134, | |
| "grad_norm": 0.169921875, | |
| "learning_rate": 1.926033662990558e-05, | |
| "loss": 0.15662331581115724, | |
| "mean_token_accuracy": 0.9472035674416289, | |
| "num_tokens": 18919416.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.0134680134680134, | |
| "eval_entropy": 0.1541357452273369, | |
| "eval_loss": 0.1643277257680893, | |
| "eval_mean_token_accuracy": 0.9455152935981751, | |
| "eval_num_tokens": 18919416.0, | |
| "eval_runtime": 13.8817, | |
| "eval_samples_per_second": 18.009, | |
| "eval_steps_per_second": 9.005, | |
| "eval_token_accuracy": 0.9454900486056786, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.14235603258013727, | |
| "epoch": 2.1818181818181817, | |
| "grad_norm": 0.1787109375, | |
| "learning_rate": 1.9099985752479505e-05, | |
| "loss": 0.13640413284301758, | |
| "mean_token_accuracy": 0.9537694716453552, | |
| "num_tokens": 20493589.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.1364088713005185, | |
| "epoch": 2.3501683501683504, | |
| "grad_norm": 0.1767578125, | |
| "learning_rate": 1.892472212642459e-05, | |
| "loss": 0.13120804786682128, | |
| "mean_token_accuracy": 0.9558958530426025, | |
| "num_tokens": 22071046.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.1340012726187706, | |
| "epoch": 2.5185185185185186, | |
| "grad_norm": 0.1728515625, | |
| "learning_rate": 1.8734832967838775e-05, | |
| "loss": 0.12886263847351073, | |
| "mean_token_accuracy": 0.956660538315773, | |
| "num_tokens": 23651892.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.13179550088942052, | |
| "epoch": 2.686868686868687, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 1.8530629460647658e-05, | |
| "loss": 0.1262987232208252, | |
| "mean_token_accuracy": 0.9577056321501732, | |
| "num_tokens": 25237419.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.686868686868687, | |
| "eval_entropy": 0.13389861524105073, | |
| "eval_loss": 0.1478790044784546, | |
| "eval_mean_token_accuracy": 0.9525656714439392, | |
| "eval_num_tokens": 25237419.0, | |
| "eval_runtime": 14.0025, | |
| "eval_samples_per_second": 17.854, | |
| "eval_steps_per_second": 8.927, | |
| "eval_token_accuracy": 0.9525425978397923, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.12678295791149138, | |
| "epoch": 2.855218855218855, | |
| "grad_norm": 0.1796875, | |
| "learning_rate": 1.831244624664681e-05, | |
| "loss": 0.12145834922790527, | |
| "mean_token_accuracy": 0.9593311008810997, | |
| "num_tokens": 26831783.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.12364728927460251, | |
| "epoch": 3.0202020202020203, | |
| "grad_norm": 0.1796875, | |
| "learning_rate": 1.808064087710212e-05, | |
| "loss": 0.11712233543395996, | |
| "mean_token_accuracy": 0.9609322833771609, | |
| "num_tokens": 28376572.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.09804413143545389, | |
| "epoch": 3.1885521885521886, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.7835593226806902e-05, | |
| "loss": 0.0911950397491455, | |
| "mean_token_accuracy": 0.9690286010503769, | |
| "num_tokens": 29961022.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.09506628409028053, | |
| "epoch": 3.356902356902357, | |
| "grad_norm": 0.1650390625, | |
| "learning_rate": 1.7577704871555924e-05, | |
| "loss": 0.08855191230773926, | |
| "mean_token_accuracy": 0.9698488730192184, | |
| "num_tokens": 31553946.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 3.356902356902357, | |
| "eval_entropy": 0.10651444137096405, | |
| "eval_loss": 0.1412735879421234, | |
| "eval_mean_token_accuracy": 0.9571833200454712, | |
| "eval_num_tokens": 31553946.0, | |
| "eval_runtime": 13.9481, | |
| "eval_samples_per_second": 17.924, | |
| "eval_steps_per_second": 8.962, | |
| "eval_token_accuracy": 0.9571799455825543, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.09585228541865945, | |
| "epoch": 3.525252525252525, | |
| "grad_norm": 0.1767578125, | |
| "learning_rate": 1.7307398430056595e-05, | |
| "loss": 0.08968615531921387, | |
| "mean_token_accuracy": 0.9697143504023552, | |
| "num_tokens": 33135630.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.10036869466304779, | |
| "epoch": 3.6936026936026938, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 1.7025116871355737e-05, | |
| "loss": 0.09400833129882813, | |
| "mean_token_accuracy": 0.9683772554993629, | |
| "num_tokens": 34707432.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.0957243438065052, | |
| "epoch": 3.861952861952862, | |
| "grad_norm": 0.1865234375, | |
| "learning_rate": 1.6731322788916892e-05, | |
| "loss": 0.08972453117370606, | |
| "mean_token_accuracy": 0.9695299303531647, | |
| "num_tokens": 36292817.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.0924541620841744, | |
| "epoch": 4.026936026936027, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 1.6426497642537826e-05, | |
| "loss": 0.08610689163208007, | |
| "mean_token_accuracy": 0.9709198608690378, | |
| "num_tokens": 37832802.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 4.026936026936027, | |
| "eval_entropy": 0.08664591646194458, | |
| "eval_loss": 0.14084948599338531, | |
| "eval_mean_token_accuracy": 0.9594513201713561, | |
| "eval_num_tokens": 37832802.0, | |
| "eval_runtime": 14.1668, | |
| "eval_samples_per_second": 17.647, | |
| "eval_steps_per_second": 8.823, | |
| "eval_token_accuracy": 0.9594704655647771, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.07393988940864801, | |
| "epoch": 4.1952861952861955, | |
| "grad_norm": 0.1767578125, | |
| "learning_rate": 1.6111140969350504e-05, | |
| "loss": 0.06671238899230957, | |
| "mean_token_accuracy": 0.9773511955142021, | |
| "num_tokens": 39420574.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.07375508725643158, | |
| "epoch": 4.363636363636363, | |
| "grad_norm": 0.1767578125, | |
| "learning_rate": 1.5785769565196543e-05, | |
| "loss": 0.06662603855133056, | |
| "mean_token_accuracy": 0.9770343449711799, | |
| "num_tokens": 40995243.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.0700146171823144, | |
| "epoch": 4.531986531986532, | |
| "grad_norm": 0.173828125, | |
| "learning_rate": 1.5450916637719683e-05, | |
| "loss": 0.06388366222381592, | |
| "mean_token_accuracy": 0.9781549483537674, | |
| "num_tokens": 42585578.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.07131696030497552, | |
| "epoch": 4.700336700336701, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 1.5107130932563151e-05, | |
| "loss": 0.0648357915878296, | |
| "mean_token_accuracy": 0.9776528170704841, | |
| "num_tokens": 44164669.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 4.700336700336701, | |
| "eval_entropy": 0.08206569665670395, | |
| "eval_loss": 0.13808086514472961, | |
| "eval_mean_token_accuracy": 0.9618017344474793, | |
| "eval_num_tokens": 44164669.0, | |
| "eval_runtime": 13.9704, | |
| "eval_samples_per_second": 17.895, | |
| "eval_steps_per_second": 8.947, | |
| "eval_token_accuracy": 0.9618092493569853, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.06924619141966104, | |
| "epoch": 4.8686868686868685, | |
| "grad_norm": 0.1689453125, | |
| "learning_rate": 1.4754975834103877e-05, | |
| "loss": 0.06314377784729004, | |
| "mean_token_accuracy": 0.9783227249979973, | |
| "num_tokens": 45746386.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.06477328384181066, | |
| "epoch": 5.033670033670034, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 1.4395028442197231e-05, | |
| "loss": 0.057736573219299314, | |
| "mean_token_accuracy": 0.9802252127199756, | |
| "num_tokens": 47291064.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.05126072809100151, | |
| "epoch": 5.202020202020202, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 1.4027878626445339e-05, | |
| "loss": 0.04461108207702637, | |
| "mean_token_accuracy": 0.984737733900547, | |
| "num_tokens": 48874157.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.05017553111538291, | |
| "epoch": 5.37037037037037, | |
| "grad_norm": 0.1728515625, | |
| "learning_rate": 1.365412805953872e-05, | |
| "loss": 0.043688578605651854, | |
| "mean_token_accuracy": 0.9849283066391945, | |
| "num_tokens": 50457004.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 5.37037037037037, | |
| "eval_entropy": 0.0671493958979845, | |
| "eval_loss": 0.15137650072574615, | |
| "eval_mean_token_accuracy": 0.9625004653930664, | |
| "eval_num_tokens": 50457004.0, | |
| "eval_runtime": 13.8354, | |
| "eval_samples_per_second": 18.07, | |
| "eval_steps_per_second": 9.035, | |
| "eval_token_accuracy": 0.9625030416255251, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.049754718681797386, | |
| "epoch": 5.538720538720539, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 1.3274389231255466e-05, | |
| "loss": 0.043588805198669436, | |
| "mean_token_accuracy": 0.9849756741523743, | |
| "num_tokens": 52047687.0, | |
| "step": 825 | |
| }, | |
| { | |
| "entropy": 0.048861624505370856, | |
| "epoch": 5.707070707070707, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 1.2889284444733722e-05, | |
| "loss": 0.04272536277770996, | |
| "mean_token_accuracy": 0.9852952674031258, | |
| "num_tokens": 53626751.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.049314973950386046, | |
| "epoch": 5.875420875420875, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 1.2499444796662354e-05, | |
| "loss": 0.04330784320831299, | |
| "mean_token_accuracy": 0.9850323754549026, | |
| "num_tokens": 55212090.0, | |
| "step": 875 | |
| }, | |
| { | |
| "entropy": 0.04571919083329184, | |
| "epoch": 6.040404040404041, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 1.2105509143061072e-05, | |
| "loss": 0.03899869203567505, | |
| "mean_token_accuracy": 0.9866329042278991, | |
| "num_tokens": 56756053.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 6.040404040404041, | |
| "eval_entropy": 0.055020518347620964, | |
| "eval_loss": 0.1728222817182541, | |
| "eval_mean_token_accuracy": 0.9626847972869873, | |
| "eval_num_tokens": 56756053.0, | |
| "eval_runtime": 13.8671, | |
| "eval_samples_per_second": 18.028, | |
| "eval_steps_per_second": 9.014, | |
| "eval_token_accuracy": 0.9626900638892185, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.034643625281751154, | |
| "epoch": 6.2087542087542085, | |
| "grad_norm": 0.12109375, | |
| "learning_rate": 1.1708123052344803e-05, | |
| "loss": 0.028563168048858643, | |
| "mean_token_accuracy": 0.9902752894163132, | |
| "num_tokens": 58343293.0, | |
| "step": 925 | |
| }, | |
| { | |
| "entropy": 0.03522825219668448, | |
| "epoch": 6.377104377104377, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 1.1307937747388034e-05, | |
| "loss": 0.028969509601593016, | |
| "mean_token_accuracy": 0.9901717621088028, | |
| "num_tokens": 59924721.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 0.034883827781304715, | |
| "epoch": 6.545454545454545, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 1.090560903832278e-05, | |
| "loss": 0.028860065937042236, | |
| "mean_token_accuracy": 0.9901738902926445, | |
| "num_tokens": 61500807.0, | |
| "step": 975 | |
| }, | |
| { | |
| "entropy": 0.03482985371723771, | |
| "epoch": 6.713804713804714, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 1.0501796247819176e-05, | |
| "loss": 0.02919963836669922, | |
| "mean_token_accuracy": 0.9900345727801323, | |
| "num_tokens": 63084518.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 6.713804713804714, | |
| "eval_entropy": 0.0541005334854126, | |
| "eval_loss": 0.16875509917736053, | |
| "eval_mean_token_accuracy": 0.9639611830711364, | |
| "eval_num_tokens": 63084518.0, | |
| "eval_runtime": 13.9974, | |
| "eval_samples_per_second": 17.86, | |
| "eval_steps_per_second": 8.93, | |
| "eval_token_accuracy": 0.963954977909252, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 0.03346874587237835, | |
| "epoch": 6.882154882154882, | |
| "grad_norm": 0.1640625, | |
| "learning_rate": 1.0097161130609774e-05, | |
| "loss": 0.027893943786621092, | |
| "mean_token_accuracy": 0.9905096918344498, | |
| "num_tokens": 64660786.0, | |
| "step": 1025 | |
| }, | |
| { | |
| "entropy": 0.03328982365260623, | |
| "epoch": 7.047138047138047, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 9.692366789028308e-06, | |
| "loss": 0.026968984603881835, | |
| "mean_token_accuracy": 0.990928383201969, | |
| "num_tokens": 66215390.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 0.024703437695279717, | |
| "epoch": 7.215488215488215, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 9.288076586340005e-06, | |
| "loss": 0.018726770877838136, | |
| "mean_token_accuracy": 0.9938653546571732, | |
| "num_tokens": 67789474.0, | |
| "step": 1075 | |
| }, | |
| { | |
| "entropy": 0.02491601270623505, | |
| "epoch": 7.383838383838384, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 8.88495305964436e-06, | |
| "loss": 0.019079389572143553, | |
| "mean_token_accuracy": 0.9937083786725998, | |
| "num_tokens": 69374157.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 7.383838383838384, | |
| "eval_entropy": 0.04581953993439675, | |
| "eval_loss": 0.19364017248153687, | |
| "eval_mean_token_accuracy": 0.9638748369216918, | |
| "eval_num_tokens": 69374157.0, | |
| "eval_runtime": 13.8268, | |
| "eval_samples_per_second": 18.081, | |
| "eval_steps_per_second": 9.04, | |
| "eval_token_accuracy": 0.9638604612813639, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 0.024102514907717705, | |
| "epoch": 7.552188552188552, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 8.48365683413172e-06, | |
| "loss": 0.018542931079864503, | |
| "mean_token_accuracy": 0.9938124868273736, | |
| "num_tokens": 70951571.0, | |
| "step": 1125 | |
| }, | |
| { | |
| "entropy": 0.023655354911461472, | |
| "epoch": 7.720538720538721, | |
| "grad_norm": 0.12451171875, | |
| "learning_rate": 8.084845540473127e-06, | |
| "loss": 0.0183677613735199, | |
| "mean_token_accuracy": 0.9938945046067238, | |
| "num_tokens": 72531150.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 0.02381689740344882, | |
| "epoch": 7.888888888888889, | |
| "grad_norm": 0.12109375, | |
| "learning_rate": 7.689172737117389e-06, | |
| "loss": 0.018559412956237795, | |
| "mean_token_accuracy": 0.9938493025302887, | |
| "num_tokens": 74120322.0, | |
| "step": 1175 | |
| }, | |
| { | |
| "entropy": 0.021846223494266064, | |
| "epoch": 8.053872053872054, | |
| "grad_norm": 0.09765625, | |
| "learning_rate": 7.297286839261659e-06, | |
| "loss": 0.016244869232177734, | |
| "mean_token_accuracy": 0.9946904477416253, | |
| "num_tokens": 75671469.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 8.053872053872054, | |
| "eval_entropy": 0.04140873943269253, | |
| "eval_loss": 0.21278557181358337, | |
| "eval_mean_token_accuracy": 0.9638264698982238, | |
| "eval_num_tokens": 75671469.0, | |
| "eval_runtime": 13.8279, | |
| "eval_samples_per_second": 18.079, | |
| "eval_steps_per_second": 9.04, | |
| "eval_token_accuracy": 0.9638101864792958, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 0.018266186909750105, | |
| "epoch": 8.222222222222221, | |
| "grad_norm": 0.0966796875, | |
| "learning_rate": 6.909830056250527e-06, | |
| "loss": 0.012853317260742188, | |
| "mean_token_accuracy": 0.9959348595142364, | |
| "num_tokens": 77255219.0, | |
| "step": 1225 | |
| }, | |
| { | |
| "entropy": 0.017858617678284644, | |
| "epoch": 8.390572390572391, | |
| "grad_norm": 0.107421875, | |
| "learning_rate": 6.527437339145097e-06, | |
| "loss": 0.012499079704284669, | |
| "mean_token_accuracy": 0.9960082057118416, | |
| "num_tokens": 78840351.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 0.018353314991109072, | |
| "epoch": 8.558922558922559, | |
| "grad_norm": 0.11376953125, | |
| "learning_rate": 6.1507353401866896e-06, | |
| "loss": 0.01300368070602417, | |
| "mean_token_accuracy": 0.9958306196331977, | |
| "num_tokens": 80420094.0, | |
| "step": 1275 | |
| }, | |
| { | |
| "entropy": 0.017453019949607552, | |
| "epoch": 8.727272727272727, | |
| "grad_norm": 0.103515625, | |
| "learning_rate": 5.780341385860333e-06, | |
| "loss": 0.012396881580352783, | |
| "mean_token_accuracy": 0.9960150149464607, | |
| "num_tokens": 82008321.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 8.727272727272727, | |
| "eval_entropy": 0.03956229141354561, | |
| "eval_loss": 0.22291041910648346, | |
| "eval_mean_token_accuracy": 0.9638104586601257, | |
| "eval_num_tokens": 82008321.0, | |
| "eval_runtime": 13.8719, | |
| "eval_samples_per_second": 18.022, | |
| "eval_steps_per_second": 9.011, | |
| "eval_token_accuracy": 0.9637880655663859, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 0.01774411663878709, | |
| "epoch": 8.895622895622896, | |
| "grad_norm": 0.10302734375, | |
| "learning_rate": 5.416862465241033e-06, | |
| "loss": 0.012684780359268188, | |
| "mean_token_accuracy": 0.9958784037828445, | |
| "num_tokens": 83583971.0, | |
| "step": 1325 | |
| }, | |
| { | |
| "entropy": 0.01662842344435654, | |
| "epoch": 9.06060606060606, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 5.060894235280637e-06, | |
| "loss": 0.011442055702209472, | |
| "mean_token_accuracy": 0.9964186977975222, | |
| "num_tokens": 85130709.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 0.01486415593419224, | |
| "epoch": 9.228956228956228, | |
| "grad_norm": 0.09521484375, | |
| "learning_rate": 4.713020044665348e-06, | |
| "loss": 0.009846681952476502, | |
| "mean_token_accuracy": 0.9969244155287743, | |
| "num_tokens": 86706204.0, | |
| "step": 1375 | |
| }, | |
| { | |
| "entropy": 0.015145676247775554, | |
| "epoch": 9.397306397306398, | |
| "grad_norm": 0.0869140625, | |
| "learning_rate": 4.373809977843676e-06, | |
| "loss": 0.010009793043136596, | |
| "mean_token_accuracy": 0.9968555930256844, | |
| "num_tokens": 88290356.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 9.397306397306398, | |
| "eval_entropy": 0.03756700618565083, | |
| "eval_loss": 0.23951546847820282, | |
| "eval_mean_token_accuracy": 0.9634870853424072, | |
| "eval_num_tokens": 88290356.0, | |
| "eval_runtime": 13.8742, | |
| "eval_samples_per_second": 18.019, | |
| "eval_steps_per_second": 9.009, | |
| "eval_token_accuracy": 0.9634723398093982, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 0.014698740621097385, | |
| "epoch": 9.565656565656566, | |
| "grad_norm": 0.08056640625, | |
| "learning_rate": 4.043819920791322e-06, | |
| "loss": 0.00973641276359558, | |
| "mean_token_accuracy": 0.996994196176529, | |
| "num_tokens": 89873272.0, | |
| "step": 1425 | |
| }, | |
| { | |
| "entropy": 0.014629123574122787, | |
| "epoch": 9.734006734006734, | |
| "grad_norm": 0.0927734375, | |
| "learning_rate": 3.7235906500440576e-06, | |
| "loss": 0.00979375422000885, | |
| "mean_token_accuracy": 0.9969650763273239, | |
| "num_tokens": 91455959.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 0.014975303104147315, | |
| "epoch": 9.902356902356903, | |
| "grad_norm": 0.0849609375, | |
| "learning_rate": 3.413646946491458e-06, | |
| "loss": 0.009887872934341431, | |
| "mean_token_accuracy": 0.9969008722901345, | |
| "num_tokens": 93038545.0, | |
| "step": 1475 | |
| }, | |
| { | |
| "entropy": 0.014437279744757985, | |
| "epoch": 10.067340067340067, | |
| "grad_norm": 0.0654296875, | |
| "learning_rate": 3.1144967353837196e-06, | |
| "loss": 0.009359861612319947, | |
| "mean_token_accuracy": 0.9971185049840382, | |
| "num_tokens": 94591222.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 10.067340067340067, | |
| "eval_entropy": 0.03697392378747463, | |
| "eval_loss": 0.24305777251720428, | |
| "eval_mean_token_accuracy": 0.9633980007171631, | |
| "eval_num_tokens": 94591222.0, | |
| "eval_runtime": 13.8912, | |
| "eval_samples_per_second": 17.997, | |
| "eval_steps_per_second": 8.999, | |
| "eval_token_accuracy": 0.9633818451656756, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 0.013786709853447973, | |
| "epoch": 10.235690235690235, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 2.8266302539609747e-06, | |
| "loss": 0.008779547214508056, | |
| "mean_token_accuracy": 0.9973234468698502, | |
| "num_tokens": 96172254.0, | |
| "step": 1525 | |
| }, | |
| { | |
| "entropy": 0.013736034976318479, | |
| "epoch": 10.404040404040405, | |
| "grad_norm": 0.07861328125, | |
| "learning_rate": 2.5505192480690865e-06, | |
| "loss": 0.008723703622817993, | |
| "mean_token_accuracy": 0.9973191824555397, | |
| "num_tokens": 97747622.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 0.01360086859203875, | |
| "epoch": 10.572390572390573, | |
| "grad_norm": 0.08154296875, | |
| "learning_rate": 2.2866161990785228e-06, | |
| "loss": 0.008712610602378846, | |
| "mean_token_accuracy": 0.997349898815155, | |
| "num_tokens": 99335839.0, | |
| "step": 1575 | |
| }, | |
| { | |
| "entropy": 0.013570407247170806, | |
| "epoch": 10.74074074074074, | |
| "grad_norm": 0.0771484375, | |
| "learning_rate": 2.0353535823732053e-06, | |
| "loss": 0.008633202314376831, | |
| "mean_token_accuracy": 0.9973410308361054, | |
| "num_tokens": 100917282.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 10.74074074074074, | |
| "eval_entropy": 0.03649875710904598, | |
| "eval_loss": 0.2482212781906128, | |
| "eval_mean_token_accuracy": 0.9633483939170837, | |
| "eval_num_tokens": 100917282.0, | |
| "eval_runtime": 13.819, | |
| "eval_samples_per_second": 18.091, | |
| "eval_steps_per_second": 9.045, | |
| "eval_token_accuracy": 0.9633335813556902, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 0.013947828765958547, | |
| "epoch": 10.909090909090908, | |
| "grad_norm": 0.07958984375, | |
| "learning_rate": 1.7971431586244814e-06, | |
| "loss": 0.008911921381950379, | |
| "mean_token_accuracy": 0.9972717782855034, | |
| "num_tokens": 102499386.0, | |
| "step": 1625 | |
| }, | |
| { | |
| "entropy": 0.013590329364702409, | |
| "epoch": 11.074074074074074, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 1.5723752990116948e-06, | |
| "loss": 0.008526145815849304, | |
| "mean_token_accuracy": 0.9974123969370005, | |
| "num_tokens": 104048535.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 0.013840707880444825, | |
| "epoch": 11.242424242424242, | |
| "grad_norm": 0.08154296875, | |
| "learning_rate": 1.3614183454950824e-06, | |
| "loss": 0.00863082766532898, | |
| "mean_token_accuracy": 0.9974041077494621, | |
| "num_tokens": 105626902.0, | |
| "step": 1675 | |
| }, | |
| { | |
| "entropy": 0.01306802561506629, | |
| "epoch": 11.41077441077441, | |
| "grad_norm": 0.0791015625, | |
| "learning_rate": 1.1646180071894608e-06, | |
| "loss": 0.008170877695083617, | |
| "mean_token_accuracy": 0.9975323963165283, | |
| "num_tokens": 107208502.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 11.41077441077441, | |
| "eval_entropy": 0.036279944390058515, | |
| "eval_loss": 0.25036948919296265, | |
| "eval_mean_token_accuracy": 0.9632471032142639, | |
| "eval_num_tokens": 107208502.0, | |
| "eval_runtime": 13.8985, | |
| "eval_samples_per_second": 17.988, | |
| "eval_steps_per_second": 8.994, | |
| "eval_token_accuracy": 0.963233031751554, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 0.013187658675014972, | |
| "epoch": 11.57912457912458, | |
| "grad_norm": 0.07861328125, | |
| "learning_rate": 9.822967938278172e-07, | |
| "loss": 0.008353355526924133, | |
| "mean_token_accuracy": 0.9974633774161339, | |
| "num_tokens": 108793895.0, | |
| "step": 1725 | |
| }, | |
| { | |
| "entropy": 0.013317351192235947, | |
| "epoch": 11.747474747474747, | |
| "grad_norm": 0.07177734375, | |
| "learning_rate": 8.147534872432761e-07, | |
| "loss": 0.008341993093490601, | |
| "mean_token_accuracy": 0.9975066068768501, | |
| "num_tokens": 110382929.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 0.013570899264886976, | |
| "epoch": 11.915824915824915, | |
| "grad_norm": 0.07763671875, | |
| "learning_rate": 6.622626517355557e-07, | |
| "loss": 0.008468236327171325, | |
| "mean_token_accuracy": 0.9973884886503219, | |
| "num_tokens": 111971394.0, | |
| "step": 1775 | |
| }, | |
| { | |
| "entropy": 0.013678188079359884, | |
| "epoch": 12.080808080808081, | |
| "grad_norm": 0.0771484375, | |
| "learning_rate": 5.250741841242735e-07, | |
| "loss": 0.008458149433135987, | |
| "mean_token_accuracy": 0.9974453488782961, | |
| "num_tokens": 113511798.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 12.080808080808081, | |
| "eval_entropy": 0.036235544994473456, | |
| "eval_loss": 0.2506777346134186, | |
| "eval_mean_token_accuracy": 0.9632981281280517, | |
| "eval_num_tokens": 113511798.0, | |
| "eval_runtime": 13.8333, | |
| "eval_samples_per_second": 18.072, | |
| "eval_steps_per_second": 9.036, | |
| "eval_token_accuracy": 0.9632792845694567, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 0.013473317860625684, | |
| "epoch": 12.24915824915825, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 4.034129042265067e-07, | |
| "loss": 0.008400481343269348, | |
| "mean_token_accuracy": 0.9974551931023597, | |
| "num_tokens": 115097998.0, | |
| "step": 1825 | |
| }, | |
| { | |
| "entropy": 0.01321539796423167, | |
| "epoch": 12.417508417508417, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 2.974781864296783e-07, | |
| "loss": 0.008239768147468567, | |
| "mean_token_accuracy": 0.9975069260597229, | |
| "num_tokens": 116678393.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 0.013363241832703352, | |
| "epoch": 12.585858585858587, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 2.0744363296356872e-07, | |
| "loss": 0.008337837457656861, | |
| "mean_token_accuracy": 0.9975020122528077, | |
| "num_tokens": 118256344.0, | |
| "step": 1875 | |
| }, | |
| { | |
| "entropy": 0.013509186143055558, | |
| "epoch": 12.754208754208754, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 1.3345678940684615e-07, | |
| "loss": 0.008378518223762512, | |
| "mean_token_accuracy": 0.9974912297725678, | |
| "num_tokens": 119840932.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 12.754208754208754, | |
| "eval_entropy": 0.036245557606220245, | |
| "eval_loss": 0.2508084177970886, | |
| "eval_mean_token_accuracy": 0.9633139533996582, | |
| "eval_num_tokens": 119840932.0, | |
| "eval_runtime": 13.8832, | |
| "eval_samples_per_second": 18.007, | |
| "eval_steps_per_second": 9.004, | |
| "eval_token_accuracy": 0.9632933615140358, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 0.013419822426512837, | |
| "epoch": 12.922558922558922, | |
| "grad_norm": 0.07861328125, | |
| "learning_rate": 7.563890289437825e-08, | |
| "loss": 0.008330971598625184, | |
| "mean_token_accuracy": 0.9974869236350059, | |
| "num_tokens": 121419896.0, | |
| "step": 1925 | |
| }, | |
| { | |
| "entropy": 0.013293538674978274, | |
| "epoch": 13.087542087542088, | |
| "grad_norm": 0.078125, | |
| "learning_rate": 3.408472342152136e-08, | |
| "loss": 0.008297484517097473, | |
| "mean_token_accuracy": 0.9974846158708844, | |
| "num_tokens": 122967645.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 0.013380372445099056, | |
| "epoch": 13.255892255892256, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 8.862348571043733e-09, | |
| "loss": 0.008304932117462159, | |
| "mean_token_accuracy": 0.9974824267625809, | |
| "num_tokens": 124542187.0, | |
| "step": 1975 | |
| }, | |
| { | |
| "entropy": 0.013427549134939909, | |
| "epoch": 13.424242424242424, | |
| "grad_norm": 0.07763671875, | |
| "learning_rate": 1.311191710651194e-11, | |
| "loss": 0.008341950178146363, | |
| "mean_token_accuracy": 0.9974777749180794, | |
| "num_tokens": 126127250.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 13.424242424242424, | |
| "eval_entropy": 0.036229205310344695, | |
| "eval_loss": 0.2507854700088501, | |
| "eval_mean_token_accuracy": 0.9632595100402832, | |
| "eval_num_tokens": 126127250.0, | |
| "eval_runtime": 13.8215, | |
| "eval_samples_per_second": 18.088, | |
| "eval_steps_per_second": 9.044, | |
| "eval_token_accuracy": 0.9632390647278022, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 2000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 14, | |
| "save_steps": 400, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.700374075876745e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |