{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9745042492917846,
  "eval_steps": 500,
  "global_step": 264,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0113314447592068,
      "grad_norm": 23.53424926746809,
      "learning_rate": 3.7037037037037036e-07,
      "loss": 1.4032,
      "step": 1
    },
    {
      "epoch": 0.0226628895184136,
      "grad_norm": 23.298712049647957,
      "learning_rate": 7.407407407407407e-07,
      "loss": 1.4235,
      "step": 2
    },
    {
      "epoch": 0.0339943342776204,
      "grad_norm": 25.15418991437702,
      "learning_rate": 1.111111111111111e-06,
      "loss": 1.3918,
      "step": 3
    },
    {
      "epoch": 0.0453257790368272,
      "grad_norm": 23.513705846381587,
      "learning_rate": 1.4814814814814815e-06,
      "loss": 1.3583,
      "step": 4
    },
    {
      "epoch": 0.056657223796033995,
      "grad_norm": 18.732779423937743,
      "learning_rate": 1.8518518518518519e-06,
      "loss": 1.2546,
      "step": 5
    },
    {
      "epoch": 0.0679886685552408,
      "grad_norm": 19.557080948103327,
      "learning_rate": 2.222222222222222e-06,
      "loss": 1.2998,
      "step": 6
    },
    {
      "epoch": 0.07932011331444759,
      "grad_norm": 10.724009151468064,
      "learning_rate": 2.5925925925925925e-06,
      "loss": 1.2226,
      "step": 7
    },
    {
      "epoch": 0.0906515580736544,
      "grad_norm": 4.775800267490731,
      "learning_rate": 2.962962962962963e-06,
      "loss": 1.0559,
      "step": 8
    },
    {
      "epoch": 0.10198300283286119,
      "grad_norm": 3.94648433253879,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.0475,
      "step": 9
    },
    {
      "epoch": 0.11331444759206799,
      "grad_norm": 3.2410802138408448,
      "learning_rate": 3.7037037037037037e-06,
      "loss": 0.9999,
      "step": 10
    },
    {
      "epoch": 0.12464589235127478,
      "grad_norm": 3.409262196892178,
      "learning_rate": 4.074074074074074e-06,
      "loss": 0.9393,
      "step": 11
    },
    {
      "epoch": 0.1359773371104816,
      "grad_norm": 2.7175044653926625,
      "learning_rate": 4.444444444444444e-06,
      "loss": 0.9487,
      "step": 12
    },
    {
      "epoch": 0.14730878186968838,
      "grad_norm": 2.3048951821321078,
      "learning_rate": 4.814814814814815e-06,
      "loss": 0.9302,
      "step": 13
    },
    {
      "epoch": 0.15864022662889518,
      "grad_norm": 2.172665075682734,
      "learning_rate": 5.185185185185185e-06,
      "loss": 0.9152,
      "step": 14
    },
    {
      "epoch": 0.16997167138810199,
      "grad_norm": 2.1574147876066445,
      "learning_rate": 5.555555555555557e-06,
      "loss": 0.828,
      "step": 15
    },
    {
      "epoch": 0.1813031161473088,
      "grad_norm": 2.097885575557383,
      "learning_rate": 5.925925925925926e-06,
      "loss": 0.8426,
      "step": 16
    },
    {
      "epoch": 0.19263456090651557,
      "grad_norm": 1.7637510926108797,
      "learning_rate": 6.296296296296297e-06,
      "loss": 0.8096,
      "step": 17
    },
    {
      "epoch": 0.20396600566572237,
      "grad_norm": 1.6562239272452715,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.838,
      "step": 18
    },
    {
      "epoch": 0.21529745042492918,
      "grad_norm": 1.4205229302221682,
      "learning_rate": 7.0370370370370375e-06,
      "loss": 0.7763,
      "step": 19
    },
    {
      "epoch": 0.22662889518413598,
      "grad_norm": 1.4262379616902559,
      "learning_rate": 7.4074074074074075e-06,
      "loss": 0.7698,
      "step": 20
    },
    {
      "epoch": 0.23796033994334279,
      "grad_norm": 1.6494892959766825,
      "learning_rate": 7.77777777777778e-06,
      "loss": 0.7665,
      "step": 21
    },
    {
      "epoch": 0.24929178470254956,
      "grad_norm": 1.4334685604983732,
      "learning_rate": 8.148148148148148e-06,
      "loss": 0.7822,
      "step": 22
    },
    {
      "epoch": 0.26062322946175637,
      "grad_norm": 1.3849818905239097,
      "learning_rate": 8.518518518518519e-06,
      "loss": 0.7283,
      "step": 23
    },
    {
      "epoch": 0.2719546742209632,
      "grad_norm": 1.341658865495544,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.6999,
      "step": 24
    },
    {
      "epoch": 0.28328611898017,
      "grad_norm": 1.30493584130229,
      "learning_rate": 9.25925925925926e-06,
      "loss": 0.7367,
      "step": 25
    },
    {
      "epoch": 0.29461756373937675,
      "grad_norm": 1.400982166207809,
      "learning_rate": 9.62962962962963e-06,
      "loss": 0.7212,
      "step": 26
    },
    {
      "epoch": 0.3059490084985836,
      "grad_norm": 1.2376220091039114,
      "learning_rate": 1e-05,
      "loss": 0.7211,
      "step": 27
    },
    {
      "epoch": 0.31728045325779036,
      "grad_norm": 1.3155466485701666,
      "learning_rate": 9.999560724782173e-06,
      "loss": 0.7194,
      "step": 28
    },
    {
      "epoch": 0.3286118980169972,
      "grad_norm": 1.1452371029975463,
      "learning_rate": 9.998242976313777e-06,
      "loss": 0.7205,
      "step": 29
    },
    {
      "epoch": 0.33994334277620397,
      "grad_norm": 1.166971440865611,
      "learning_rate": 9.99604698613651e-06,
      "loss": 0.7097,
      "step": 30
    },
    {
      "epoch": 0.35127478753541075,
      "grad_norm": 1.3261901880488491,
      "learning_rate": 9.992973140107998e-06,
      "loss": 0.6974,
      "step": 31
    },
    {
      "epoch": 0.3626062322946176,
      "grad_norm": 1.2464894375774034,
      "learning_rate": 9.989021978333996e-06,
      "loss": 0.7082,
      "step": 32
    },
    {
      "epoch": 0.37393767705382436,
      "grad_norm": 1.1355726846388239,
      "learning_rate": 9.98419419507348e-06,
      "loss": 0.6734,
      "step": 33
    },
    {
      "epoch": 0.38526912181303113,
      "grad_norm": 1.0490944265922426,
      "learning_rate": 9.978490638616671e-06,
      "loss": 0.6853,
      "step": 34
    },
    {
      "epoch": 0.39660056657223797,
      "grad_norm": 1.2600398341735712,
      "learning_rate": 9.971912311135967e-06,
      "loss": 0.6703,
      "step": 35
    },
    {
      "epoch": 0.40793201133144474,
      "grad_norm": 1.1344614916090783,
      "learning_rate": 9.964460368509868e-06,
      "loss": 0.6841,
      "step": 36
    },
    {
      "epoch": 0.4192634560906516,
      "grad_norm": 1.1274773706270436,
      "learning_rate": 9.956136120119858e-06,
      "loss": 0.6817,
      "step": 37
    },
    {
      "epoch": 0.43059490084985835,
      "grad_norm": 1.2113014190849545,
      "learning_rate": 9.946941028620349e-06,
      "loss": 0.6837,
      "step": 38
    },
    {
      "epoch": 0.44192634560906513,
      "grad_norm": 1.172097274900492,
      "learning_rate": 9.936876709681668e-06,
      "loss": 0.6678,
      "step": 39
    },
    {
      "epoch": 0.45325779036827196,
      "grad_norm": 1.2490078546533159,
      "learning_rate": 9.925944931706174e-06,
      "loss": 0.7413,
      "step": 40
    },
    {
      "epoch": 0.46458923512747874,
      "grad_norm": 1.140086857717275,
      "learning_rate": 9.914147615517527e-06,
      "loss": 0.6778,
      "step": 41
    },
    {
      "epoch": 0.47592067988668557,
      "grad_norm": 1.2223423143241732,
      "learning_rate": 9.901486834023182e-06,
      "loss": 0.7388,
      "step": 42
    },
    {
      "epoch": 0.48725212464589235,
      "grad_norm": 1.2452223599483243,
      "learning_rate": 9.887964811850159e-06,
      "loss": 0.691,
      "step": 43
    },
    {
      "epoch": 0.4985835694050991,
      "grad_norm": 1.1350515055455908,
      "learning_rate": 9.873583924954152e-06,
      "loss": 0.6593,
      "step": 44
    },
    {
      "epoch": 0.509915014164306,
      "grad_norm": 1.0405719380983063,
      "learning_rate": 9.85834670020205e-06,
      "loss": 0.6351,
      "step": 45
    },
    {
      "epoch": 0.5212464589235127,
      "grad_norm": 1.3303109757890985,
      "learning_rate": 9.842255814927945e-06,
      "loss": 0.6404,
      "step": 46
    },
    {
      "epoch": 0.5325779036827195,
      "grad_norm": 1.1787017310861478,
      "learning_rate": 9.825314096462686e-06,
      "loss": 0.6858,
      "step": 47
    },
    {
      "epoch": 0.5439093484419264,
      "grad_norm": 1.1028621034116284,
      "learning_rate": 9.807524521637103e-06,
      "loss": 0.6554,
      "step": 48
    },
    {
      "epoch": 0.5552407932011332,
      "grad_norm": 1.0192876247663198,
      "learning_rate": 9.78889021625894e-06,
      "loss": 0.6581,
      "step": 49
    },
    {
      "epoch": 0.56657223796034,
      "grad_norm": 1.0981773991994468,
      "learning_rate": 9.769414454563614e-06,
      "loss": 0.6873,
      "step": 50
    },
    {
      "epoch": 0.5779036827195467,
      "grad_norm": 1.080964680948062,
      "learning_rate": 9.749100658638914e-06,
      "loss": 0.6313,
      "step": 51
    },
    {
      "epoch": 0.5892351274787535,
      "grad_norm": 1.060635241593271,
      "learning_rate": 9.72795239782369e-06,
      "loss": 0.657,
      "step": 52
    },
    {
      "epoch": 0.6005665722379604,
      "grad_norm": 1.1436681010237095,
      "learning_rate": 9.705973388080694e-06,
      "loss": 0.6521,
      "step": 53
    },
    {
      "epoch": 0.6118980169971672,
      "grad_norm": 1.0838029458150678,
      "learning_rate": 9.68316749134364e-06,
      "loss": 0.6712,
      "step": 54
    },
    {
      "epoch": 0.623229461756374,
      "grad_norm": 1.0579456798759823,
      "learning_rate": 9.659538714838635e-06,
      "loss": 0.6439,
      "step": 55
    },
    {
      "epoch": 0.6345609065155807,
      "grad_norm": 1.000408593357701,
      "learning_rate": 9.635091210380052e-06,
      "loss": 0.6164,
      "step": 56
    },
    {
      "epoch": 0.6458923512747875,
      "grad_norm": 1.0871122101771147,
      "learning_rate": 9.609829273641034e-06,
      "loss": 0.6561,
      "step": 57
    },
    {
      "epoch": 0.6572237960339944,
      "grad_norm": 1.0392258903623652,
      "learning_rate": 9.583757343398685e-06,
      "loss": 0.6353,
      "step": 58
    },
    {
      "epoch": 0.6685552407932012,
      "grad_norm": 1.0694855168162771,
      "learning_rate": 9.55688000075414e-06,
      "loss": 0.672,
      "step": 59
    },
    {
      "epoch": 0.6798866855524079,
      "grad_norm": 1.0818048041242603,
      "learning_rate": 9.529201968327618e-06,
      "loss": 0.6649,
      "step": 60
    },
    {
      "epoch": 0.6912181303116147,
      "grad_norm": 1.122154267801109,
      "learning_rate": 9.500728109428603e-06,
      "loss": 0.6338,
      "step": 61
    },
    {
      "epoch": 0.7025495750708215,
      "grad_norm": 1.0115716268572774,
      "learning_rate": 9.47146342720133e-06,
      "loss": 0.6404,
      "step": 62
    },
    {
      "epoch": 0.7138810198300283,
      "grad_norm": 1.060628179091387,
      "learning_rate": 9.44141306374566e-06,
      "loss": 0.6491,
      "step": 63
    },
    {
      "epoch": 0.7252124645892352,
      "grad_norm": 1.0433876035374046,
      "learning_rate": 9.410582299213574e-06,
      "loss": 0.6131,
      "step": 64
    },
    {
      "epoch": 0.7365439093484419,
      "grad_norm": 1.0724446453489962,
      "learning_rate": 9.378976550881393e-06,
      "loss": 0.645,
      "step": 65
    },
    {
      "epoch": 0.7478753541076487,
      "grad_norm": 16.698318216158572,
      "learning_rate": 9.346601372197914e-06,
      "loss": 0.628,
      "step": 66
    },
    {
      "epoch": 0.7592067988668555,
      "grad_norm": 1.088611623094774,
      "learning_rate": 9.3134624518086e-06,
      "loss": 0.651,
      "step": 67
    },
    {
      "epoch": 0.7705382436260623,
      "grad_norm": 1.08573159288467,
      "learning_rate": 9.279565612556043e-06,
      "loss": 0.6913,
      "step": 68
    },
    {
      "epoch": 0.7818696883852692,
      "grad_norm": 1.287771998076043,
      "learning_rate": 9.244916810456822e-06,
      "loss": 0.6167,
      "step": 69
    },
    {
      "epoch": 0.7932011331444759,
      "grad_norm": 1.0734450115631073,
      "learning_rate": 9.20952213365497e-06,
      "loss": 0.6048,
      "step": 70
    },
    {
      "epoch": 0.8045325779036827,
      "grad_norm": 1.041169203868327,
      "learning_rate": 9.173387801352232e-06,
      "loss": 0.622,
      "step": 71
    },
    {
      "epoch": 0.8158640226628895,
      "grad_norm": 1.0441941562582049,
      "learning_rate": 9.136520162715288e-06,
      "loss": 0.636,
      "step": 72
    },
    {
      "epoch": 0.8271954674220963,
      "grad_norm": 1.0266494367822185,
      "learning_rate": 9.098925695760132e-06,
      "loss": 0.641,
      "step": 73
    },
    {
      "epoch": 0.8385269121813032,
      "grad_norm": 1.0525228370033899,
      "learning_rate": 9.060611006213833e-06,
      "loss": 0.605,
      "step": 74
    },
    {
      "epoch": 0.8498583569405099,
      "grad_norm": 1.0169561500024211,
      "learning_rate": 9.021582826353825e-06,
      "loss": 0.6691,
      "step": 75
    },
    {
      "epoch": 0.8611898016997167,
      "grad_norm": 1.0482739302531685,
      "learning_rate": 8.981848013824995e-06,
      "loss": 0.6658,
      "step": 76
    },
    {
      "epoch": 0.8725212464589235,
      "grad_norm": 1.0794377750181379,
      "learning_rate": 8.94141355043471e-06,
      "loss": 0.6578,
      "step": 77
    },
    {
      "epoch": 0.8838526912181303,
      "grad_norm": 1.0439742131558416,
      "learning_rate": 8.900286540926062e-06,
      "loss": 0.6138,
      "step": 78
    },
    {
      "epoch": 0.8951841359773371,
      "grad_norm": 1.072198566934302,
      "learning_rate": 8.85847421172947e-06,
      "loss": 0.6313,
      "step": 79
    },
    {
      "epoch": 0.9065155807365439,
      "grad_norm": 1.0570789500714661,
      "learning_rate": 8.815983909692941e-06,
      "loss": 0.611,
      "step": 80
    },
    {
      "epoch": 0.9178470254957507,
      "grad_norm": 0.9747424186741095,
      "learning_rate": 8.772823100791152e-06,
      "loss": 0.6235,
      "step": 81
    },
    {
      "epoch": 0.9291784702549575,
      "grad_norm": 0.9650403389286071,
      "learning_rate": 8.728999368813591e-06,
      "loss": 0.6289,
      "step": 82
    },
    {
      "epoch": 0.9405099150141643,
      "grad_norm": 1.0608225953186365,
      "learning_rate": 8.684520414032023e-06,
      "loss": 0.6534,
      "step": 83
    },
    {
      "epoch": 0.9518413597733711,
      "grad_norm": 1.0400599060401146,
      "learning_rate": 8.639394051847472e-06,
      "loss": 0.6351,
      "step": 84
    },
    {
      "epoch": 0.9631728045325779,
      "grad_norm": 1.029029843151287,
      "learning_rate": 8.593628211416964e-06,
      "loss": 0.637,
      "step": 85
    },
    {
      "epoch": 0.9745042492917847,
      "grad_norm": 0.9884213872615792,
      "learning_rate": 8.547230934260313e-06,
      "loss": 0.6414,
      "step": 86
    },
    {
      "epoch": 0.9858356940509915,
      "grad_norm": 1.0448881569178157,
      "learning_rate": 8.500210372847128e-06,
      "loss": 0.6234,
      "step": 87
    },
    {
      "epoch": 0.9971671388101983,
      "grad_norm": 1.0141851489732272,
      "learning_rate": 8.452574789164352e-06,
      "loss": 0.636,
      "step": 88
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.0141851489732272,
      "learning_rate": 8.404332553264548e-06,
      "loss": 0.6351,
      "step": 89
    },
    {
      "epoch": 1.0113314447592068,
      "grad_norm": 2.0125074643954024,
      "learning_rate": 8.355492141795185e-06,
      "loss": 0.5146,
      "step": 90
    },
    {
      "epoch": 1.0226628895184136,
      "grad_norm": 1.1306298266109818,
      "learning_rate": 8.30606213650922e-06,
      "loss": 0.497,
      "step": 91
    },
    {
      "epoch": 1.0339943342776203,
      "grad_norm": 1.0714951468489908,
      "learning_rate": 8.256051222757188e-06,
      "loss": 0.4921,
      "step": 92
    },
    {
      "epoch": 1.045325779036827,
      "grad_norm": 0.9830972246185706,
      "learning_rate": 8.2054681879611e-06,
      "loss": 0.4906,
      "step": 93
    },
    {
      "epoch": 1.056657223796034,
      "grad_norm": 0.9632332800113752,
      "learning_rate": 8.154321920070415e-06,
      "loss": 0.4657,
      "step": 94
    },
    {
      "epoch": 1.0679886685552409,
      "grad_norm": 1.193395200214797,
      "learning_rate": 8.10262140600031e-06,
      "loss": 0.4861,
      "step": 95
    },
    {
      "epoch": 1.0793201133144477,
      "grad_norm": 1.28865060369019,
      "learning_rate": 8.050375730052622e-06,
      "loss": 0.5093,
      "step": 96
    },
    {
      "epoch": 1.0906515580736544,
      "grad_norm": 1.247161113611643,
      "learning_rate": 7.997594072319625e-06,
      "loss": 0.504,
      "step": 97
    },
    {
      "epoch": 1.1019830028328612,
      "grad_norm": 1.1321908951225559,
      "learning_rate": 7.944285707070999e-06,
      "loss": 0.514,
      "step": 98
    },
    {
      "epoch": 1.113314447592068,
      "grad_norm": 1.097294675331813,
      "learning_rate": 7.890460001124242e-06,
      "loss": 0.5074,
      "step": 99
    },
    {
      "epoch": 1.1246458923512748,
      "grad_norm": 1.1106766243842143,
      "learning_rate": 7.836126412198842e-06,
      "loss": 0.495,
      "step": 100
    },
    {
      "epoch": 1.1359773371104815,
      "grad_norm": 1.0781028414115594,
      "learning_rate": 7.781294487254436e-06,
      "loss": 0.4917,
      "step": 101
    },
    {
      "epoch": 1.1473087818696883,
      "grad_norm": 1.0597834799331805,
      "learning_rate": 7.725973860813338e-06,
      "loss": 0.4953,
      "step": 102
    },
    {
      "epoch": 1.158640226628895,
      "grad_norm": 1.075317244066298,
      "learning_rate": 7.67017425326764e-06,
      "loss": 0.4985,
      "step": 103
    },
    {
      "epoch": 1.1699716713881019,
      "grad_norm": 1.126814415152867,
      "learning_rate": 7.613905469171247e-06,
      "loss": 0.4869,
      "step": 104
    },
    {
      "epoch": 1.1813031161473089,
      "grad_norm": 1.0228965180222989,
      "learning_rate": 7.5571773955171124e-06,
      "loss": 0.4956,
      "step": 105
    },
    {
      "epoch": 1.1926345609065157,
      "grad_norm": 1.0496260656765666,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.4804,
      "step": 106
    },
    {
      "epoch": 1.2039660056657224,
      "grad_norm": 1.057021462285616,
      "learning_rate": 7.442383329265063e-06,
      "loss": 0.4802,
      "step": 107
    },
    {
      "epoch": 1.2152974504249292,
      "grad_norm": 1.0586760394529304,
      "learning_rate": 7.3843375071425315e-06,
      "loss": 0.4755,
      "step": 108
    },
    {
      "epoch": 1.226628895184136,
      "grad_norm": 1.098164645835599,
      "learning_rate": 7.32587273286887e-06,
      "loss": 0.4806,
      "step": 109
    },
    {
      "epoch": 1.2379603399433428,
      "grad_norm": 1.0425540537419706,
      "learning_rate": 7.2669992792946595e-06,
      "loss": 0.4976,
      "step": 110
    },
    {
      "epoch": 1.2492917847025495,
      "grad_norm": 1.0081778159600596,
      "learning_rate": 7.2077274910795605e-06,
      "loss": 0.4775,
      "step": 111
    },
    {
      "epoch": 1.2606232294617563,
      "grad_norm": 1.0426051895523285,
      "learning_rate": 7.14806778287464e-06,
      "loss": 0.4948,
      "step": 112
    },
    {
      "epoch": 1.271954674220963,
      "grad_norm": 1.0491543765702032,
      "learning_rate": 7.088030637492429e-06,
      "loss": 0.5198,
      "step": 113
    },
    {
      "epoch": 1.28328611898017,
      "grad_norm": 1.0120042186636362,
      "learning_rate": 7.02762660406497e-06,
      "loss": 0.5032,
      "step": 114
    },
    {
      "epoch": 1.2946175637393766,
      "grad_norm": 1.0538656654185354,
      "learning_rate": 6.966866296190243e-06,
      "loss": 0.4835,
      "step": 115
    },
    {
      "epoch": 1.3059490084985836,
      "grad_norm": 0.983675200448248,
      "learning_rate": 6.9057603900672355e-06,
      "loss": 0.4469,
      "step": 116
    },
    {
      "epoch": 1.3172804532577904,
      "grad_norm": 1.1103412550285476,
      "learning_rate": 6.844319622620039e-06,
      "loss": 0.5124,
      "step": 117
    },
    {
      "epoch": 1.3286118980169972,
      "grad_norm": 1.064307096238654,
      "learning_rate": 6.782554789611256e-06,
      "loss": 0.4943,
      "step": 118
    },
    {
      "epoch": 1.339943342776204,
      "grad_norm": 1.0325281954877101,
      "learning_rate": 6.7204767437450725e-06,
      "loss": 0.4703,
      "step": 119
    },
    {
      "epoch": 1.3512747875354107,
      "grad_norm": 1.0365628583864628,
      "learning_rate": 6.65809639276034e-06,
      "loss": 0.494,
      "step": 120
    },
    {
      "epoch": 1.3626062322946175,
      "grad_norm": 1.0482388627399757,
      "learning_rate": 6.595424697513963e-06,
      "loss": 0.4502,
      "step": 121
    },
    {
      "epoch": 1.3739376770538243,
      "grad_norm": 1.0272064142818405,
      "learning_rate": 6.532472670054975e-06,
      "loss": 0.492,
      "step": 122
    },
    {
      "epoch": 1.385269121813031,
      "grad_norm": 1.0810272879082132,
      "learning_rate": 6.469251371689606e-06,
      "loss": 0.4847,
      "step": 123
    },
    {
      "epoch": 1.3966005665722379,
      "grad_norm": 1.0366197600921454,
      "learning_rate": 6.405771911037698e-06,
      "loss": 0.4999,
      "step": 124
    },
    {
      "epoch": 1.4079320113314449,
      "grad_norm": 1.0295069364200777,
      "learning_rate": 6.342045442080818e-06,
      "loss": 0.4783,
      "step": 125
    },
    {
      "epoch": 1.4192634560906516,
      "grad_norm": 1.0528763013327969,
      "learning_rate": 6.278083162202374e-06,
      "loss": 0.4846,
      "step": 126
    },
    {
      "epoch": 1.4305949008498584,
      "grad_norm": 1.0734593139015471,
      "learning_rate": 6.21389631022014e-06,
      "loss": 0.5134,
      "step": 127
    },
    {
      "epoch": 1.4419263456090652,
      "grad_norm": 1.0207282551843653,
      "learning_rate": 6.1494961644114685e-06,
      "loss": 0.4855,
      "step": 128
    },
    {
      "epoch": 1.453257790368272,
      "grad_norm": 0.9713494112903828,
      "learning_rate": 6.084894040531591e-06,
      "loss": 0.4667,
      "step": 129
    },
    {
      "epoch": 1.4645892351274787,
      "grad_norm": 1.1036048289558185,
      "learning_rate": 6.0201012898253244e-06,
      "loss": 0.4905,
      "step": 130
    },
    {
      "epoch": 1.4759206798866855,
      "grad_norm": 0.996202225854195,
      "learning_rate": 5.9551292970325394e-06,
      "loss": 0.4746,
      "step": 131
    },
    {
      "epoch": 1.4872521246458923,
      "grad_norm": 1.0919133151119662,
      "learning_rate": 5.8899894783877536e-06,
      "loss": 0.5201,
      "step": 132
    },
    {
      "epoch": 1.498583569405099,
      "grad_norm": 1.11280141387768,
      "learning_rate": 5.824693279614171e-06,
      "loss": 0.4953,
      "step": 133
    },
    {
      "epoch": 1.509915014164306,
      "grad_norm": 1.1163217052046956,
      "learning_rate": 5.759252173912573e-06,
      "loss": 0.481,
      "step": 134
    },
    {
      "epoch": 1.5212464589235126,
      "grad_norm": 1.0688323988028812,
      "learning_rate": 5.693677659945343e-06,
      "loss": 0.4711,
      "step": 135
    },
    {
      "epoch": 1.5325779036827196,
      "grad_norm": 0.9512892167994508,
      "learning_rate": 5.627981259816041e-06,
      "loss": 0.4697,
      "step": 136
    },
    {
      "epoch": 1.5439093484419264,
      "grad_norm": 1.0157798339830766,
      "learning_rate": 5.562174517044862e-06,
      "loss": 0.4728,
      "step": 137
    },
    {
      "epoch": 1.5552407932011332,
      "grad_norm": 0.9982778169224142,
      "learning_rate": 5.496268994540309e-06,
      "loss": 0.453,
      "step": 138
    },
    {
      "epoch": 1.56657223796034,
      "grad_norm": 1.1297738773397445,
      "learning_rate": 5.430276272567485e-06,
      "loss": 0.495,
      "step": 139
    },
    {
      "epoch": 1.5779036827195467,
      "grad_norm": 1.0139903899310507,
      "learning_rate": 5.364207946713318e-06,
      "loss": 0.4844,
      "step": 140
    },
    {
      "epoch": 1.5892351274787535,
      "grad_norm": 0.9490126458491319,
      "learning_rate": 5.2980756258491e-06,
      "loss": 0.4632,
      "step": 141
    },
    {
      "epoch": 1.6005665722379603,
      "grad_norm": 0.9789924111916612,
      "learning_rate": 5.231890930090692e-06,
      "loss": 0.4641,
      "step": 142
    },
    {
      "epoch": 1.6118980169971673,
      "grad_norm": 1.0001930502458516,
      "learning_rate": 5.165665488756755e-06,
      "loss": 0.4511,
      "step": 143
    },
    {
      "epoch": 1.6232294617563738,
      "grad_norm": 1.0412278168604834,
      "learning_rate": 5.099410938325351e-06,
      "loss": 0.4813,
      "step": 144
    },
    {
      "epoch": 1.6345609065155808,
      "grad_norm": 2.990945647537025,
      "learning_rate": 5.033138920389313e-06,
      "loss": 0.4949,
      "step": 145
    },
    {
      "epoch": 1.6458923512747874,
      "grad_norm": 0.9622418163601026,
      "learning_rate": 4.966861079610688e-06,
      "loss": 0.4855,
      "step": 146
    },
    {
      "epoch": 1.6572237960339944,
      "grad_norm": 1.0030167678640822,
      "learning_rate": 4.900589061674649e-06,
      "loss": 0.4589,
      "step": 147
    },
    {
      "epoch": 1.6685552407932012,
      "grad_norm": 1.0109166766299091,
      "learning_rate": 4.8343345112432475e-06,
      "loss": 0.4778,
      "step": 148
    },
    {
      "epoch": 1.679886685552408,
      "grad_norm": 1.0402771028968805,
      "learning_rate": 4.7681090699093076e-06,
      "loss": 0.4874,
      "step": 149
    },
    {
      "epoch": 1.6912181303116147,
      "grad_norm": 1.0333160217244122,
      "learning_rate": 4.701924374150901e-06,
      "loss": 0.469,
      "step": 150
    },
    {
      "epoch": 1.7025495750708215,
      "grad_norm": 1.0264878278726923,
      "learning_rate": 4.635792053286682e-06,
      "loss": 0.477,
      "step": 151
    },
    {
      "epoch": 1.7138810198300283,
      "grad_norm": 0.9806277129349131,
      "learning_rate": 4.569723727432517e-06,
      "loss": 0.4609,
      "step": 152
    },
    {
      "epoch": 1.725212464589235,
      "grad_norm": 1.0430109649067774,
      "learning_rate": 4.5037310054596936e-06,
      "loss": 0.4852,
      "step": 153
    },
    {
      "epoch": 1.736543909348442,
      "grad_norm": 1.0177412955604808,
      "learning_rate": 4.43782548295514e-06,
      "loss": 0.4538,
      "step": 154
    },
    {
      "epoch": 1.7478753541076486,
      "grad_norm": 1.0742221754801993,
      "learning_rate": 4.372018740183961e-06,
      "loss": 0.502,
      "step": 155
    },
    {
      "epoch": 1.7592067988668556,
      "grad_norm": 1.2114594760413002,
      "learning_rate": 4.30632234005466e-06,
      "loss": 0.4626,
      "step": 156
    },
    {
      "epoch": 1.7705382436260622,
      "grad_norm": 1.0105219104936058,
      "learning_rate": 4.2407478260874294e-06,
      "loss": 0.4443,
      "step": 157
    },
    {
      "epoch": 1.7818696883852692,
      "grad_norm": 1.0676939421912321,
      "learning_rate": 4.175306720385831e-06,
      "loss": 0.461,
      "step": 158
    },
    {
      "epoch": 1.793201133144476,
      "grad_norm": 1.0843360976121068,
      "learning_rate": 4.11001052161225e-06,
      "loss": 0.4562,
      "step": 159
    },
    {
      "epoch": 1.8045325779036827,
      "grad_norm": 1.0182445190909426,
      "learning_rate": 4.044870702967461e-06,
      "loss": 0.4597,
      "step": 160
    },
    {
      "epoch": 1.8158640226628895,
      "grad_norm": 1.0266398146802735,
      "learning_rate": 3.979898710174678e-06,
      "loss": 0.4737,
      "step": 161
    },
    {
      "epoch": 1.8271954674220963,
      "grad_norm": 1.0375307407230006,
      "learning_rate": 3.91510595946841e-06,
      "loss": 0.476,
      "step": 162
    },
    {
      "epoch": 1.8385269121813033,
      "grad_norm": 1.0510195116895713,
      "learning_rate": 3.850503835588533e-06,
      "loss": 0.4572,
      "step": 163
    },
    {
      "epoch": 1.8498583569405098,
      "grad_norm": 1.0707576258473916,
      "learning_rate": 3.786103689779861e-06,
      "loss": 0.4855,
      "step": 164
    },
    {
      "epoch": 1.8611898016997168,
      "grad_norm": 1.109879197789788,
      "learning_rate": 3.721916837797627e-06,
      "loss": 0.4744,
      "step": 165
    },
    {
      "epoch": 1.8725212464589234,
      "grad_norm": 0.9430434127126872,
      "learning_rate": 3.6579545579191834e-06,
      "loss": 0.5036,
      "step": 166
    },
    {
      "epoch": 1.8838526912181304,
      "grad_norm": 1.0454617136926816,
      "learning_rate": 3.5942280889623028e-06,
      "loss": 0.4757,
      "step": 167
    },
    {
      "epoch": 1.8951841359773371,
      "grad_norm": 0.9669993221473043,
      "learning_rate": 3.5307486283103966e-06,
      "loss": 0.4939,
      "step": 168
    },
    {
      "epoch": 1.906515580736544,
      "grad_norm": 1.1489596332179548,
      "learning_rate": 3.4675273299450264e-06,
      "loss": 0.4875,
      "step": 169
    },
    {
      "epoch": 1.9178470254957507,
      "grad_norm": 1.236638321882873,
      "learning_rate": 3.4045753024860393e-06,
      "loss": 0.4899,
      "step": 170
    },
    {
      "epoch": 1.9291784702549575,
      "grad_norm": 1.0015067232304347,
      "learning_rate": 3.3419036072396614e-06,
      "loss": 0.4367,
      "step": 171
    },
    {
      "epoch": 1.9405099150141643,
      "grad_norm": 0.991139662986458,
      "learning_rate": 3.2795232562549296e-06,
      "loss": 0.4593,
      "step": 172
    },
    {
      "epoch": 1.951841359773371,
      "grad_norm": 1.0171228373147831,
      "learning_rate": 3.2174452103887455e-06,
      "loss": 0.4864,
      "step": 173
    },
    {
      "epoch": 1.963172804532578,
      "grad_norm": 1.0183503025841374,
      "learning_rate": 3.1556803773799616e-06,
      "loss": 0.4775,
      "step": 174
    },
    {
      "epoch": 1.9745042492917846,
      "grad_norm": 0.9658158834425475,
      "learning_rate": 3.0942396099327645e-06,
      "loss": 0.4628,
      "step": 175
    },
    {
      "epoch": 1.9858356940509916,
      "grad_norm": 1.0046391704473616,
      "learning_rate": 3.03313370380976e-06,
      "loss": 0.4945,
      "step": 176
    },
    {
      "epoch": 1.9971671388101981,
      "grad_norm": 0.9746868290860945,
      "learning_rate": 2.972373395935031e-06,
      "loss": 0.4384,
      "step": 177
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.9746868290860945,
      "learning_rate": 2.911969362507574e-06,
      "loss": 0.4562,
      "step": 178
    },
    {
      "epoch": 2.011331444759207,
      "grad_norm": 2.1634786845934584,
      "learning_rate": 2.8519322171253605e-06,
      "loss": 0.3576,
      "step": 179
    },
    {
      "epoch": 2.0226628895184136,
      "grad_norm": 1.259167809122465,
      "learning_rate": 2.792272508920443e-06,
      "loss": 0.3306,
      "step": 180
    },
    {
      "epoch": 2.0339943342776206,
      "grad_norm": 1.3388873888110011,
      "learning_rate": 2.7330007207053413e-06,
      "loss": 0.353,
      "step": 181
    },
    {
      "epoch": 2.045325779036827,
      "grad_norm": 1.1581849151502048,
      "learning_rate": 2.674127267131131e-06,
      "loss": 0.3317,
      "step": 182
    },
    {
      "epoch": 2.056657223796034,
      "grad_norm": 1.0160032268336192,
      "learning_rate": 2.615662492857471e-06,
      "loss": 0.3581,
      "step": 183
    },
    {
      "epoch": 2.0679886685552407,
      "grad_norm": 1.0233678646861728,
      "learning_rate": 2.5576166707349387e-06,
      "loss": 0.3359,
      "step": 184
    },
    {
      "epoch": 2.0793201133144477,
      "grad_norm": 1.0874679159300038,
      "learning_rate": 2.5000000000000015e-06,
      "loss": 0.3219,
      "step": 185
    },
    {
      "epoch": 2.090651558073654,
      "grad_norm": 1.2469998353736902,
      "learning_rate": 2.4428226044828896e-06,
      "loss": 0.3271,
      "step": 186
    },
    {
      "epoch": 2.101983002832861,
      "grad_norm": 1.1847806975199535,
      "learning_rate": 2.3860945308287554e-06,
      "loss": 0.3429,
      "step": 187
    },
    {
      "epoch": 2.113314447592068,
      "grad_norm": 1.3829661881977866,
      "learning_rate": 2.3298257467323605e-06,
      "loss": 0.3492,
      "step": 188
    },
    {
      "epoch": 2.1246458923512748,
      "grad_norm": 1.1118666347289263,
      "learning_rate": 2.2740261391866634e-06,
      "loss": 0.3343,
      "step": 189
    },
    {
      "epoch": 2.1359773371104818,
      "grad_norm": 1.1295786044065697,
      "learning_rate": 2.2187055127455653e-06,
      "loss": 0.3306,
      "step": 190
    },
    {
      "epoch": 2.1473087818696883,
      "grad_norm": 1.3950194361496737,
      "learning_rate": 2.1638735878011603e-06,
      "loss": 0.3515,
      "step": 191
    },
    {
      "epoch": 2.1586402266288953,
      "grad_norm": 1.1337210762125438,
      "learning_rate": 2.1095399988757574e-06,
      "loss": 0.3201,
      "step": 192
    },
    {
      "epoch": 2.169971671388102,
      "grad_norm": 1.059433716116878,
      "learning_rate": 2.0557142929290027e-06,
      "loss": 0.3526,
      "step": 193
    },
    {
      "epoch": 2.181303116147309,
      "grad_norm": 1.0876920114742847,
      "learning_rate": 2.0024059276803742e-06,
      "loss": 0.3275,
      "step": 194
    },
    {
      "epoch": 2.1926345609065154,
      "grad_norm": 1.136528776323311,
      "learning_rate": 1.949624269947378e-06,
      "loss": 0.3499,
      "step": 195
    },
    {
      "epoch": 2.2039660056657224,
      "grad_norm": 1.1195654060494844,
      "learning_rate": 1.897378593999693e-06,
      "loss": 0.3105,
      "step": 196
    },
    {
      "epoch": 2.215297450424929,
      "grad_norm": 1.0686107201673802,
      "learning_rate": 1.8456780799295888e-06,
      "loss": 0.3409,
      "step": 197
    },
    {
      "epoch": 2.226628895184136,
      "grad_norm": 1.1176135978118285,
      "learning_rate": 1.794531812038901e-06,
      "loss": 0.3242,
      "step": 198
    },
    {
      "epoch": 2.237960339943343,
      "grad_norm": 1.1225522593354427,
      "learning_rate": 1.7439487772428142e-06,
      "loss": 0.3331,
      "step": 199
    },
    {
      "epoch": 2.2492917847025495,
      "grad_norm": 1.0504526826797216,
      "learning_rate": 1.6939378634907815e-06,
      "loss": 0.3223,
      "step": 200
    },
    {
      "epoch": 2.2606232294617565,
      "grad_norm": 1.0112717450368687,
      "learning_rate": 1.6445078582048158e-06,
      "loss": 0.3328,
      "step": 201
    },
    {
      "epoch": 2.271954674220963,
      "grad_norm": 1.0056815807805697,
      "learning_rate": 1.5956674467354538e-06,
      "loss": 0.3349,
      "step": 202
    },
    {
      "epoch": 2.28328611898017,
      "grad_norm": 1.0324761445382153,
      "learning_rate": 1.5474252108356475e-06,
      "loss": 0.3147,
      "step": 203
    },
    {
      "epoch": 2.2946175637393766,
      "grad_norm": 3.0756191856725437,
      "learning_rate": 1.499789627152874e-06,
      "loss": 0.3148,
      "step": 204
    },
    {
      "epoch": 2.3059490084985836,
      "grad_norm": 1.14933836933374,
      "learning_rate": 1.452769065739688e-06,
      "loss": 0.3487,
      "step": 205
    },
    {
      "epoch": 2.31728045325779,
      "grad_norm": 0.9691075451255097,
      "learning_rate": 1.4063717885830375e-06,
      "loss": 0.3216,
      "step": 206
    },
    {
      "epoch": 2.328611898016997,
      "grad_norm": 1.2745727227347767,
      "learning_rate": 1.3606059481525296e-06,
      "loss": 0.3585,
      "step": 207
    },
    {
      "epoch": 2.3399433427762037,
      "grad_norm": 0.9868916262509804,
      "learning_rate": 1.3154795859679781e-06,
      "loss": 0.3416,
      "step": 208
    },
    {
      "epoch": 2.3512747875354107,
      "grad_norm": 1.029329657926381,
      "learning_rate": 1.2710006311864104e-06,
      "loss": 0.3438,
      "step": 209
    },
    {
      "epoch": 2.3626062322946177,
      "grad_norm": 1.2301937202365874,
      "learning_rate": 1.227176899208849e-06,
      "loss": 0.3232,
      "step": 210
    },
    {
      "epoch": 2.3739376770538243,
      "grad_norm": 1.1079694734813215,
      "learning_rate": 1.1840160903070591e-06,
      "loss": 0.3533,
      "step": 211
    },
    {
      "epoch": 2.3852691218130313,
      "grad_norm": 1.0355467829487406,
      "learning_rate": 1.141525788270531e-06,
      "loss": 0.3455,
      "step": 212
    },
    {
      "epoch": 2.396600566572238,
      "grad_norm": 1.0285862271263877,
      "learning_rate": 1.09971345907394e-06,
      "loss": 0.2994,
      "step": 213
    },
    {
      "epoch": 2.407932011331445,
      "grad_norm": 1.0633893519411577,
      "learning_rate": 1.0585864495652899e-06,
      "loss": 0.3386,
      "step": 214
    },
    {
      "epoch": 2.4192634560906514,
      "grad_norm": 0.985097331489618,
      "learning_rate": 1.0181519861750078e-06,
      "loss": 0.3181,
      "step": 215
    },
    {
      "epoch": 2.4305949008498584,
      "grad_norm": 0.951413780263271,
      "learning_rate": 9.784171736461762e-07,
      "loss": 0.3105,
      "step": 216
    },
    {
      "epoch": 2.441926345609065,
      "grad_norm": 1.0282273427987358,
      "learning_rate": 9.393889937861694e-07,
      "loss": 0.3179,
      "step": 217
    },
    {
      "epoch": 2.453257790368272,
      "grad_norm": 1.026529941608791,
      "learning_rate": 9.010743042398684e-07,
      "loss": 0.3234,
      "step": 218
    },
    {
      "epoch": 2.4645892351274785,
      "grad_norm": 1.0299442438320148,
      "learning_rate": 8.634798372847148e-07,
      "loss": 0.335,
      "step": 219
    },
    {
      "epoch": 2.4759206798866855,
      "grad_norm": 0.9309231031132973,
      "learning_rate": 8.266121986477699e-07,
      "loss": 0.318,
      "step": 220
    },
    {
      "epoch": 2.4872521246458925,
      "grad_norm": 1.0062159661580126,
      "learning_rate": 7.904778663450325e-07,
      "loss": 0.3292,
      "step": 221
    },
    {
      "epoch": 2.498583569405099,
      "grad_norm": 1.0354919361102888,
      "learning_rate": 7.550831895431799e-07,
      "loss": 0.3266,
      "step": 222
    },
    {
      "epoch": 2.509915014164306,
      "grad_norm": 0.9693415538045153,
      "learning_rate": 7.204343874439578e-07,
      "loss": 0.3282,
      "step": 223
    },
    {
      "epoch": 2.5212464589235126,
      "grad_norm": 1.0178821797615285,
      "learning_rate": 6.865375481914017e-07,
      "loss": 0.3561,
      "step": 224
    },
    {
      "epoch": 2.5325779036827196,
      "grad_norm": 1.0271091771586642,
      "learning_rate": 6.533986278020876e-07,
      "loss": 0.3064,
      "step": 225
    },
    {
      "epoch": 2.543909348441926,
      "grad_norm": 0.9930205073186488,
      "learning_rate": 6.210234491186079e-07,
      "loss": 0.318,
      "step": 226
    },
    {
      "epoch": 2.555240793201133,
      "grad_norm": 1.015466323155115,
      "learning_rate": 5.894177007864272e-07,
      "loss": 0.3408,
      "step": 227
    },
    {
      "epoch": 2.56657223796034,
      "grad_norm": 1.065785552873228,
      "learning_rate": 5.585869362543416e-07,
      "loss": 0.3414,
      "step": 228
    },
    {
      "epoch": 2.5779036827195467,
      "grad_norm": 1.0524927446179813,
      "learning_rate": 5.285365727986708e-07,
      "loss": 0.3422,
      "step": 229
    },
    {
      "epoch": 2.5892351274787533,
      "grad_norm": 1.0219196548167786,
      "learning_rate": 4.992718905713967e-07,
      "loss": 0.3388,
      "step": 230
    },
    {
      "epoch": 2.6005665722379603,
      "grad_norm": 0.9679912813387603,
      "learning_rate": 4.707980316723837e-07,
      "loss": 0.3165,
      "step": 231
    },
    {
      "epoch": 2.6118980169971673,
      "grad_norm": 0.9893500327460035,
      "learning_rate": 4.431199992458607e-07,
      "loss": 0.3238,
      "step": 232
    },
    {
      "epoch": 2.623229461756374,
      "grad_norm": 0.9876579686339385,
      "learning_rate": 4.16242656601315e-07,
      "loss": 0.308,
      "step": 233
    },
    {
      "epoch": 2.634560906515581,
      "grad_norm": 1.01213916356771,
      "learning_rate": 3.9017072635896716e-07,
      "loss": 0.331,
      "step": 234
    },
    {
      "epoch": 2.6458923512747874,
      "grad_norm": 1.0151577294613559,
      "learning_rate": 3.649087896199488e-07,
      "loss": 0.3098,
      "step": 235
    },
    {
      "epoch": 2.6572237960339944,
      "grad_norm": 0.9854787297770221,
      "learning_rate": 3.404612851613676e-07,
      "loss": 0.3202,
      "step": 236
    },
    {
      "epoch": 2.668555240793201,
      "grad_norm": 2.5197939583747866,
      "learning_rate": 3.168325086563612e-07,
      "loss": 0.3302,
      "step": 237
    },
    {
      "epoch": 2.679886685552408,
      "grad_norm": 0.9681009670329549,
      "learning_rate": 2.9402661191930803e-07,
      "loss": 0.3221,
      "step": 238
    },
    {
      "epoch": 2.691218130311615,
      "grad_norm": 1.0155833734622453,
      "learning_rate": 2.7204760217631074e-07,
      "loss": 0.324,
      "step": 239
    },
    {
      "epoch": 2.7025495750708215,
      "grad_norm": 1.1931982505904983,
      "learning_rate": 2.5089934136108665e-07,
      "loss": 0.3327,
      "step": 240
    },
    {
      "epoch": 2.713881019830028,
      "grad_norm": 0.9735860788683143,
      "learning_rate": 2.30585545436387e-07,
      "loss": 0.3483,
      "step": 241
    },
    {
      "epoch": 2.725212464589235,
      "grad_norm": 0.9628214952166717,
      "learning_rate": 2.1110978374106195e-07,
      "loss": 0.3455,
      "step": 242
    },
    {
      "epoch": 2.736543909348442,
      "grad_norm": 1.4367674984114238,
      "learning_rate": 1.9247547836289792e-07,
      "loss": 0.3565,
      "step": 243
    },
    {
      "epoch": 2.7478753541076486,
      "grad_norm": 1.0738794442822241,
      "learning_rate": 1.7468590353731495e-07,
      "loss": 0.3577,
      "step": 244
    },
    {
      "epoch": 2.7592067988668556,
      "grad_norm": 1.0163993435166494,
      "learning_rate": 1.577441850720568e-07,
      "loss": 0.3346,
      "step": 245
    },
    {
      "epoch": 2.770538243626062,
      "grad_norm": 1.1268283470669345,
      "learning_rate": 1.4165329979794972e-07,
      "loss": 0.3204,
      "step": 246
    },
    {
      "epoch": 2.781869688385269,
      "grad_norm": 1.00412302366148,
      "learning_rate": 1.264160750458493e-07,
      "loss": 0.3091,
      "step": 247
    },
    {
      "epoch": 2.7932011331444757,
      "grad_norm": 1.0878323463224275,
      "learning_rate": 1.1203518814984216e-07,
      "loss": 0.3219,
      "step": 248
    },
    {
      "epoch": 2.8045325779036827,
      "grad_norm": 1.0326844241286977,
      "learning_rate": 9.851316597681959e-08,
      "loss": 0.3407,
      "step": 249
    },
    {
      "epoch": 2.8158640226628897,
      "grad_norm": 1.0488660487318535,
      "learning_rate": 8.585238448247434e-08,
      "loss": 0.3066,
      "step": 250
    },
    {
      "epoch": 2.8271954674220963,
      "grad_norm": 0.9440222402450956,
      "learning_rate": 7.405506829382736e-08,
      "loss": 0.3007,
      "step": 251
    },
    {
      "epoch": 2.8385269121813033,
      "grad_norm": 0.9992965787158642,
      "learning_rate": 6.31232903183332e-08,
      "loss": 0.3211,
      "step": 252
    },
    {
      "epoch": 2.84985835694051,
      "grad_norm": 1.0525889142182898,
      "learning_rate": 5.305897137965199e-08,
      "loss": 0.3339,
      "step": 253
    },
    {
      "epoch": 2.861189801699717,
      "grad_norm": 1.0406232501867803,
      "learning_rate": 4.3863879880142737e-08,
      "loss": 0.3188,
      "step": 254
    },
    {
      "epoch": 2.8725212464589234,
      "grad_norm": 1.0108504238438418,
      "learning_rate": 3.553963149013295e-08,
      "loss": 0.3426,
      "step": 255
    },
    {
      "epoch": 2.8838526912181304,
      "grad_norm": 1.040975846702501,
      "learning_rate": 2.8087688864033014e-08,
      "loss": 0.3365,
      "step": 256
    },
    {
      "epoch": 2.8951841359773374,
      "grad_norm": 1.0279134406587973,
      "learning_rate": 2.1509361383330597e-08,
      "loss": 0.3167,
      "step": 257
    },
    {
      "epoch": 2.906515580736544,
      "grad_norm": 1.0127896976081647,
      "learning_rate": 1.580580492652084e-08,
      "loss": 0.3589,
      "step": 258
    },
    {
      "epoch": 2.9178470254957505,
      "grad_norm": 1.002944001928922,
      "learning_rate": 1.0978021666005479e-08,
      "loss": 0.3382,
      "step": 259
    },
    {
      "epoch": 2.9291784702549575,
      "grad_norm": 0.9936405641782646,
      "learning_rate": 7.02685989200258e-09,
      "loss": 0.3373,
      "step": 260
    },
    {
      "epoch": 2.9405099150141645,
      "grad_norm": 1.0916598818224916,
      "learning_rate": 3.953013863490784e-09,
      "loss": 0.3124,
      "step": 261
    },
    {
      "epoch": 2.951841359773371,
      "grad_norm": 0.9881063904383428,
      "learning_rate": 1.757023686224102e-09,
      "loss": 0.3401,
      "step": 262
    },
    {
      "epoch": 2.963172804532578,
      "grad_norm": 0.9953124861905701,
      "learning_rate": 4.392752178278281e-10,
      "loss": 0.3202,
      "step": 263
    },
    {
      "epoch": 2.9745042492917846,
      "grad_norm": 0.9957928075882635,
      "learning_rate": 0.0,
      "loss": 0.299,
      "step": 264
    }
  ],
  "logging_steps": 1,
  "max_steps": 264,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 72196646453248.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}