{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9745042492917846, "eval_steps": 500, "global_step": 264, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0113314447592068, "grad_norm": 23.53424926746809, "learning_rate": 3.7037037037037036e-07, "loss": 1.4032, "step": 1 }, { "epoch": 0.0226628895184136, "grad_norm": 23.298712049647957, "learning_rate": 7.407407407407407e-07, "loss": 1.4235, "step": 2 }, { "epoch": 0.0339943342776204, "grad_norm": 25.15418991437702, "learning_rate": 1.111111111111111e-06, "loss": 1.3918, "step": 3 }, { "epoch": 0.0453257790368272, "grad_norm": 23.513705846381587, "learning_rate": 1.4814814814814815e-06, "loss": 1.3583, "step": 4 }, { "epoch": 0.056657223796033995, "grad_norm": 18.732779423937743, "learning_rate": 1.8518518518518519e-06, "loss": 1.2546, "step": 5 }, { "epoch": 0.0679886685552408, "grad_norm": 19.557080948103327, "learning_rate": 2.222222222222222e-06, "loss": 1.2998, "step": 6 }, { "epoch": 0.07932011331444759, "grad_norm": 10.724009151468064, "learning_rate": 2.5925925925925925e-06, "loss": 1.2226, "step": 7 }, { "epoch": 0.0906515580736544, "grad_norm": 4.775800267490731, "learning_rate": 2.962962962962963e-06, "loss": 1.0559, "step": 8 }, { "epoch": 0.10198300283286119, "grad_norm": 3.94648433253879, "learning_rate": 3.3333333333333333e-06, "loss": 1.0475, "step": 9 }, { "epoch": 0.11331444759206799, "grad_norm": 3.2410802138408448, "learning_rate": 3.7037037037037037e-06, "loss": 0.9999, "step": 10 }, { "epoch": 0.12464589235127478, "grad_norm": 3.409262196892178, "learning_rate": 4.074074074074074e-06, "loss": 0.9393, "step": 11 }, { "epoch": 0.1359773371104816, "grad_norm": 2.7175044653926625, "learning_rate": 4.444444444444444e-06, "loss": 0.9487, "step": 12 }, { "epoch": 0.14730878186968838, "grad_norm": 2.3048951821321078, "learning_rate": 4.814814814814815e-06, "loss": 0.9302, "step": 13 }, { "epoch": 0.15864022662889518, "grad_norm": 2.172665075682734, "learning_rate": 5.185185185185185e-06, "loss": 0.9152, "step": 14 }, { "epoch": 0.16997167138810199, "grad_norm": 2.1574147876066445, "learning_rate": 5.555555555555557e-06, "loss": 0.828, "step": 15 }, { "epoch": 0.1813031161473088, "grad_norm": 2.097885575557383, "learning_rate": 5.925925925925926e-06, "loss": 0.8426, "step": 16 }, { "epoch": 0.19263456090651557, "grad_norm": 1.7637510926108797, "learning_rate": 6.296296296296297e-06, "loss": 0.8096, "step": 17 }, { "epoch": 0.20396600566572237, "grad_norm": 1.6562239272452715, "learning_rate": 6.666666666666667e-06, "loss": 0.838, "step": 18 }, { "epoch": 0.21529745042492918, "grad_norm": 1.4205229302221682, "learning_rate": 7.0370370370370375e-06, "loss": 0.7763, "step": 19 }, { "epoch": 0.22662889518413598, "grad_norm": 1.4262379616902559, "learning_rate": 7.4074074074074075e-06, "loss": 0.7698, "step": 20 }, { "epoch": 0.23796033994334279, "grad_norm": 1.6494892959766825, "learning_rate": 7.77777777777778e-06, "loss": 0.7665, "step": 21 }, { "epoch": 0.24929178470254956, "grad_norm": 1.4334685604983732, "learning_rate": 8.148148148148148e-06, "loss": 0.7822, "step": 22 }, { "epoch": 0.26062322946175637, "grad_norm": 1.3849818905239097, "learning_rate": 8.518518518518519e-06, "loss": 0.7283, "step": 23 }, { "epoch": 0.2719546742209632, "grad_norm": 1.341658865495544, "learning_rate": 8.888888888888888e-06, "loss": 0.6999, "step": 24 }, { "epoch": 0.28328611898017, "grad_norm": 1.30493584130229, "learning_rate": 9.25925925925926e-06, "loss": 0.7367, "step": 25 }, { "epoch": 0.29461756373937675, "grad_norm": 1.400982166207809, "learning_rate": 9.62962962962963e-06, "loss": 0.7212, "step": 26 }, { "epoch": 0.3059490084985836, "grad_norm": 1.2376220091039114, "learning_rate": 1e-05, "loss": 0.7211, "step": 27 }, { "epoch": 0.31728045325779036, "grad_norm": 1.3155466485701666, "learning_rate": 9.999560724782173e-06, "loss": 0.7194, "step": 28 }, { "epoch": 0.3286118980169972, "grad_norm": 1.1452371029975463, "learning_rate": 9.998242976313777e-06, "loss": 0.7205, "step": 29 }, { "epoch": 0.33994334277620397, "grad_norm": 1.166971440865611, "learning_rate": 9.99604698613651e-06, "loss": 0.7097, "step": 30 }, { "epoch": 0.35127478753541075, "grad_norm": 1.3261901880488491, "learning_rate": 9.992973140107998e-06, "loss": 0.6974, "step": 31 }, { "epoch": 0.3626062322946176, "grad_norm": 1.2464894375774034, "learning_rate": 9.989021978333996e-06, "loss": 0.7082, "step": 32 }, { "epoch": 0.37393767705382436, "grad_norm": 1.1355726846388239, "learning_rate": 9.98419419507348e-06, "loss": 0.6734, "step": 33 }, { "epoch": 0.38526912181303113, "grad_norm": 1.0490944265922426, "learning_rate": 9.978490638616671e-06, "loss": 0.6853, "step": 34 }, { "epoch": 0.39660056657223797, "grad_norm": 1.2600398341735712, "learning_rate": 9.971912311135967e-06, "loss": 0.6703, "step": 35 }, { "epoch": 0.40793201133144474, "grad_norm": 1.1344614916090783, "learning_rate": 9.964460368509868e-06, "loss": 0.6841, "step": 36 }, { "epoch": 0.4192634560906516, "grad_norm": 1.1274773706270436, "learning_rate": 9.956136120119858e-06, "loss": 0.6817, "step": 37 }, { "epoch": 0.43059490084985835, "grad_norm": 1.2113014190849545, "learning_rate": 9.946941028620349e-06, "loss": 0.6837, "step": 38 }, { "epoch": 0.44192634560906513, "grad_norm": 1.172097274900492, "learning_rate": 9.936876709681668e-06, "loss": 0.6678, "step": 39 }, { "epoch": 0.45325779036827196, "grad_norm": 1.2490078546533159, "learning_rate": 9.925944931706174e-06, "loss": 0.7413, "step": 40 }, { "epoch": 0.46458923512747874, "grad_norm": 1.140086857717275, "learning_rate": 9.914147615517527e-06, "loss": 0.6778, "step": 41 }, { "epoch": 0.47592067988668557, "grad_norm": 1.2223423143241732, "learning_rate": 9.901486834023182e-06, "loss": 0.7388, "step": 42 }, { "epoch": 0.48725212464589235, "grad_norm": 1.2452223599483243, "learning_rate": 9.887964811850159e-06, "loss": 0.691, "step": 43 }, { "epoch": 0.4985835694050991, "grad_norm": 1.1350515055455908, "learning_rate": 9.873583924954152e-06, "loss": 0.6593, "step": 44 }, { "epoch": 0.509915014164306, "grad_norm": 1.0405719380983063, "learning_rate": 9.85834670020205e-06, "loss": 0.6351, "step": 45 }, { "epoch": 0.5212464589235127, "grad_norm": 1.3303109757890985, "learning_rate": 9.842255814927945e-06, "loss": 0.6404, "step": 46 }, { "epoch": 0.5325779036827195, "grad_norm": 1.1787017310861478, "learning_rate": 9.825314096462686e-06, "loss": 0.6858, "step": 47 }, { "epoch": 0.5439093484419264, "grad_norm": 1.1028621034116284, "learning_rate": 9.807524521637103e-06, "loss": 0.6554, "step": 48 }, { "epoch": 0.5552407932011332, "grad_norm": 1.0192876247663198, "learning_rate": 9.78889021625894e-06, "loss": 0.6581, "step": 49 }, { "epoch": 0.56657223796034, "grad_norm": 1.0981773991994468, "learning_rate": 9.769414454563614e-06, "loss": 0.6873, "step": 50 }, { "epoch": 0.5779036827195467, "grad_norm": 1.080964680948062, "learning_rate": 9.749100658638914e-06, "loss": 0.6313, "step": 51 }, { "epoch": 0.5892351274787535, "grad_norm": 1.060635241593271, "learning_rate": 9.72795239782369e-06, "loss": 0.657, "step": 52 }, { "epoch": 0.6005665722379604, "grad_norm": 1.1436681010237095, "learning_rate": 9.705973388080694e-06, "loss": 0.6521, "step": 53 }, { "epoch": 0.6118980169971672, "grad_norm": 1.0838029458150678, "learning_rate": 9.68316749134364e-06, "loss": 0.6712, "step": 54 }, { "epoch": 0.623229461756374, "grad_norm": 1.0579456798759823, "learning_rate": 9.659538714838635e-06, "loss": 0.6439, "step": 55 }, { "epoch": 0.6345609065155807, "grad_norm": 1.000408593357701, "learning_rate": 9.635091210380052e-06, "loss": 0.6164, "step": 56 }, { "epoch": 0.6458923512747875, "grad_norm": 1.0871122101771147, "learning_rate": 9.609829273641034e-06, "loss": 0.6561, "step": 57 }, { "epoch": 0.6572237960339944, "grad_norm": 1.0392258903623652, "learning_rate": 9.583757343398685e-06, "loss": 0.6353, "step": 58 }, { "epoch": 0.6685552407932012, "grad_norm": 1.0694855168162771, "learning_rate": 9.55688000075414e-06, "loss": 0.672, "step": 59 }, { "epoch": 0.6798866855524079, "grad_norm": 1.0818048041242603, "learning_rate": 9.529201968327618e-06, "loss": 0.6649, "step": 60 }, { "epoch": 0.6912181303116147, "grad_norm": 1.122154267801109, "learning_rate": 9.500728109428603e-06, "loss": 0.6338, "step": 61 }, { "epoch": 0.7025495750708215, "grad_norm": 1.0115716268572774, "learning_rate": 9.47146342720133e-06, "loss": 0.6404, "step": 62 }, { "epoch": 0.7138810198300283, "grad_norm": 1.060628179091387, "learning_rate": 9.44141306374566e-06, "loss": 0.6491, "step": 63 }, { "epoch": 0.7252124645892352, "grad_norm": 1.0433876035374046, "learning_rate": 9.410582299213574e-06, "loss": 0.6131, "step": 64 }, { "epoch": 0.7365439093484419, "grad_norm": 1.0724446453489962, "learning_rate": 9.378976550881393e-06, "loss": 0.645, "step": 65 }, { "epoch": 0.7478753541076487, "grad_norm": 16.698318216158572, "learning_rate": 9.346601372197914e-06, "loss": 0.628, "step": 66 }, { "epoch": 0.7592067988668555, "grad_norm": 1.088611623094774, "learning_rate": 9.3134624518086e-06, "loss": 0.651, "step": 67 }, { "epoch": 0.7705382436260623, "grad_norm": 1.08573159288467, "learning_rate": 9.279565612556043e-06, "loss": 0.6913, "step": 68 }, { "epoch": 0.7818696883852692, "grad_norm": 1.287771998076043, "learning_rate": 9.244916810456822e-06, "loss": 0.6167, "step": 69 }, { "epoch": 0.7932011331444759, "grad_norm": 1.0734450115631073, "learning_rate": 9.20952213365497e-06, "loss": 0.6048, "step": 70 }, { "epoch": 0.8045325779036827, "grad_norm": 1.041169203868327, "learning_rate": 9.173387801352232e-06, "loss": 0.622, "step": 71 }, { "epoch": 0.8158640226628895, "grad_norm": 1.0441941562582049, "learning_rate": 9.136520162715288e-06, "loss": 0.636, "step": 72 }, { "epoch": 0.8271954674220963, "grad_norm": 1.0266494367822185, "learning_rate": 9.098925695760132e-06, "loss": 0.641, "step": 73 }, { "epoch": 0.8385269121813032, "grad_norm": 1.0525228370033899, "learning_rate": 9.060611006213833e-06, "loss": 0.605, "step": 74 }, { "epoch": 0.8498583569405099, "grad_norm": 1.0169561500024211, "learning_rate": 9.021582826353825e-06, "loss": 0.6691, "step": 75 }, { "epoch": 0.8611898016997167, "grad_norm": 1.0482739302531685, "learning_rate": 8.981848013824995e-06, "loss": 0.6658, "step": 76 }, { "epoch": 0.8725212464589235, "grad_norm": 1.0794377750181379, "learning_rate": 8.94141355043471e-06, "loss": 0.6578, "step": 77 }, { "epoch": 0.8838526912181303, "grad_norm": 1.0439742131558416, "learning_rate": 8.900286540926062e-06, "loss": 0.6138, "step": 78 }, { "epoch": 0.8951841359773371, "grad_norm": 1.072198566934302, "learning_rate": 8.85847421172947e-06, "loss": 0.6313, "step": 79 }, { "epoch": 0.9065155807365439, "grad_norm": 1.0570789500714661, "learning_rate": 8.815983909692941e-06, "loss": 0.611, "step": 80 }, { "epoch": 0.9178470254957507, "grad_norm": 0.9747424186741095, "learning_rate": 8.772823100791152e-06, "loss": 0.6235, "step": 81 }, { "epoch": 0.9291784702549575, "grad_norm": 0.9650403389286071, "learning_rate": 8.728999368813591e-06, "loss": 0.6289, "step": 82 }, { "epoch": 0.9405099150141643, "grad_norm": 1.0608225953186365, "learning_rate": 8.684520414032023e-06, "loss": 0.6534, "step": 83 }, { "epoch": 0.9518413597733711, "grad_norm": 1.0400599060401146, "learning_rate": 8.639394051847472e-06, "loss": 0.6351, "step": 84 }, { "epoch": 0.9631728045325779, "grad_norm": 1.029029843151287, "learning_rate": 8.593628211416964e-06, "loss": 0.637, "step": 85 }, { "epoch": 0.9745042492917847, "grad_norm": 0.9884213872615792, "learning_rate": 8.547230934260313e-06, "loss": 0.6414, "step": 86 }, { "epoch": 0.9858356940509915, "grad_norm": 1.0448881569178157, "learning_rate": 8.500210372847128e-06, "loss": 0.6234, "step": 87 }, { "epoch": 0.9971671388101983, "grad_norm": 1.0141851489732272, "learning_rate": 8.452574789164352e-06, "loss": 0.636, "step": 88 }, { "epoch": 1.0, "grad_norm": 1.0141851489732272, "learning_rate": 8.404332553264548e-06, "loss": 0.6351, "step": 89 }, { "epoch": 1.0113314447592068, "grad_norm": 2.0125074643954024, "learning_rate": 8.355492141795185e-06, "loss": 0.5146, "step": 90 }, { "epoch": 1.0226628895184136, "grad_norm": 1.1306298266109818, "learning_rate": 8.30606213650922e-06, "loss": 0.497, "step": 91 }, { "epoch": 1.0339943342776203, "grad_norm": 1.0714951468489908, "learning_rate": 8.256051222757188e-06, "loss": 0.4921, "step": 92 }, { "epoch": 1.045325779036827, "grad_norm": 0.9830972246185706, "learning_rate": 8.2054681879611e-06, "loss": 0.4906, "step": 93 }, { "epoch": 1.056657223796034, "grad_norm": 0.9632332800113752, "learning_rate": 8.154321920070415e-06, "loss": 0.4657, "step": 94 }, { "epoch": 1.0679886685552409, "grad_norm": 1.193395200214797, "learning_rate": 8.10262140600031e-06, "loss": 0.4861, "step": 95 }, { "epoch": 1.0793201133144477, "grad_norm": 1.28865060369019, "learning_rate": 8.050375730052622e-06, "loss": 0.5093, "step": 96 }, { "epoch": 1.0906515580736544, "grad_norm": 1.247161113611643, "learning_rate": 7.997594072319625e-06, "loss": 0.504, "step": 97 }, { "epoch": 1.1019830028328612, "grad_norm": 1.1321908951225559, "learning_rate": 7.944285707070999e-06, "loss": 0.514, "step": 98 }, { "epoch": 1.113314447592068, "grad_norm": 1.097294675331813, "learning_rate": 7.890460001124242e-06, "loss": 0.5074, "step": 99 }, { "epoch": 1.1246458923512748, "grad_norm": 1.1106766243842143, "learning_rate": 7.836126412198842e-06, "loss": 0.495, "step": 100 }, { "epoch": 1.1359773371104815, "grad_norm": 1.0781028414115594, "learning_rate": 7.781294487254436e-06, "loss": 0.4917, "step": 101 }, { "epoch": 1.1473087818696883, "grad_norm": 1.0597834799331805, "learning_rate": 7.725973860813338e-06, "loss": 0.4953, "step": 102 }, { "epoch": 1.158640226628895, "grad_norm": 1.075317244066298, "learning_rate": 7.67017425326764e-06, "loss": 0.4985, "step": 103 }, { "epoch": 1.1699716713881019, "grad_norm": 1.126814415152867, "learning_rate": 7.613905469171247e-06, "loss": 0.4869, "step": 104 }, { "epoch": 1.1813031161473089, "grad_norm": 1.0228965180222989, "learning_rate": 7.5571773955171124e-06, "loss": 0.4956, "step": 105 }, { "epoch": 1.1926345609065157, "grad_norm": 1.0496260656765666, "learning_rate": 7.500000000000001e-06, "loss": 0.4804, "step": 106 }, { "epoch": 1.2039660056657224, "grad_norm": 1.057021462285616, "learning_rate": 7.442383329265063e-06, "loss": 0.4802, "step": 107 }, { "epoch": 1.2152974504249292, "grad_norm": 1.0586760394529304, "learning_rate": 7.3843375071425315e-06, "loss": 0.4755, "step": 108 }, { "epoch": 1.226628895184136, "grad_norm": 1.098164645835599, "learning_rate": 7.32587273286887e-06, "loss": 0.4806, "step": 109 }, { "epoch": 1.2379603399433428, "grad_norm": 1.0425540537419706, "learning_rate": 7.2669992792946595e-06, "loss": 0.4976, "step": 110 }, { "epoch": 1.2492917847025495, "grad_norm": 1.0081778159600596, "learning_rate": 7.2077274910795605e-06, "loss": 0.4775, "step": 111 }, { "epoch": 1.2606232294617563, "grad_norm": 1.0426051895523285, "learning_rate": 7.14806778287464e-06, "loss": 0.4948, "step": 112 }, { "epoch": 1.271954674220963, "grad_norm": 1.0491543765702032, "learning_rate": 7.088030637492429e-06, "loss": 0.5198, "step": 113 }, { "epoch": 1.28328611898017, "grad_norm": 1.0120042186636362, "learning_rate": 7.02762660406497e-06, "loss": 0.5032, "step": 114 }, { "epoch": 1.2946175637393766, "grad_norm": 1.0538656654185354, "learning_rate": 6.966866296190243e-06, "loss": 0.4835, "step": 115 }, { "epoch": 1.3059490084985836, "grad_norm": 0.983675200448248, "learning_rate": 6.9057603900672355e-06, "loss": 0.4469, "step": 116 }, { "epoch": 1.3172804532577904, "grad_norm": 1.1103412550285476, "learning_rate": 6.844319622620039e-06, "loss": 0.5124, "step": 117 }, { "epoch": 1.3286118980169972, "grad_norm": 1.064307096238654, "learning_rate": 6.782554789611256e-06, "loss": 0.4943, "step": 118 }, { "epoch": 1.339943342776204, "grad_norm": 1.0325281954877101, "learning_rate": 6.7204767437450725e-06, "loss": 0.4703, "step": 119 }, { "epoch": 1.3512747875354107, "grad_norm": 1.0365628583864628, "learning_rate": 6.65809639276034e-06, "loss": 0.494, "step": 120 }, { "epoch": 1.3626062322946175, "grad_norm": 1.0482388627399757, "learning_rate": 6.595424697513963e-06, "loss": 0.4502, "step": 121 }, { "epoch": 1.3739376770538243, "grad_norm": 1.0272064142818405, "learning_rate": 6.532472670054975e-06, "loss": 0.492, "step": 122 }, { "epoch": 1.385269121813031, "grad_norm": 1.0810272879082132, "learning_rate": 6.469251371689606e-06, "loss": 0.4847, "step": 123 }, { "epoch": 1.3966005665722379, "grad_norm": 1.0366197600921454, "learning_rate": 6.405771911037698e-06, "loss": 0.4999, "step": 124 }, { "epoch": 1.4079320113314449, "grad_norm": 1.0295069364200777, "learning_rate": 6.342045442080818e-06, "loss": 0.4783, "step": 125 }, { "epoch": 1.4192634560906516, "grad_norm": 1.0528763013327969, "learning_rate": 6.278083162202374e-06, "loss": 0.4846, "step": 126 }, { "epoch": 1.4305949008498584, "grad_norm": 1.0734593139015471, "learning_rate": 6.21389631022014e-06, "loss": 0.5134, "step": 127 }, { "epoch": 1.4419263456090652, "grad_norm": 1.0207282551843653, "learning_rate": 6.1494961644114685e-06, "loss": 0.4855, "step": 128 }, { "epoch": 1.453257790368272, "grad_norm": 0.9713494112903828, "learning_rate": 6.084894040531591e-06, "loss": 0.4667, "step": 129 }, { "epoch": 1.4645892351274787, "grad_norm": 1.1036048289558185, "learning_rate": 6.0201012898253244e-06, "loss": 0.4905, "step": 130 }, { "epoch": 1.4759206798866855, "grad_norm": 0.996202225854195, "learning_rate": 5.9551292970325394e-06, "loss": 0.4746, "step": 131 }, { "epoch": 1.4872521246458923, "grad_norm": 1.0919133151119662, "learning_rate": 5.8899894783877536e-06, "loss": 0.5201, "step": 132 }, { "epoch": 1.498583569405099, "grad_norm": 1.11280141387768, "learning_rate": 5.824693279614171e-06, "loss": 0.4953, "step": 133 }, { "epoch": 1.509915014164306, "grad_norm": 1.1163217052046956, "learning_rate": 5.759252173912573e-06, "loss": 0.481, "step": 134 }, { "epoch": 1.5212464589235126, "grad_norm": 1.0688323988028812, "learning_rate": 5.693677659945343e-06, "loss": 0.4711, "step": 135 }, { "epoch": 1.5325779036827196, "grad_norm": 0.9512892167994508, "learning_rate": 5.627981259816041e-06, "loss": 0.4697, "step": 136 }, { "epoch": 1.5439093484419264, "grad_norm": 1.0157798339830766, "learning_rate": 5.562174517044862e-06, "loss": 0.4728, "step": 137 }, { "epoch": 1.5552407932011332, "grad_norm": 0.9982778169224142, "learning_rate": 5.496268994540309e-06, "loss": 0.453, "step": 138 }, { "epoch": 1.56657223796034, "grad_norm": 1.1297738773397445, "learning_rate": 5.430276272567485e-06, "loss": 0.495, "step": 139 }, { "epoch": 1.5779036827195467, "grad_norm": 1.0139903899310507, "learning_rate": 5.364207946713318e-06, "loss": 0.4844, "step": 140 }, { "epoch": 1.5892351274787535, "grad_norm": 0.9490126458491319, "learning_rate": 5.2980756258491e-06, "loss": 0.4632, "step": 141 }, { "epoch": 1.6005665722379603, "grad_norm": 0.9789924111916612, "learning_rate": 5.231890930090692e-06, "loss": 0.4641, "step": 142 }, { "epoch": 1.6118980169971673, "grad_norm": 1.0001930502458516, "learning_rate": 5.165665488756755e-06, "loss": 0.4511, "step": 143 }, { "epoch": 1.6232294617563738, "grad_norm": 1.0412278168604834, "learning_rate": 5.099410938325351e-06, "loss": 0.4813, "step": 144 }, { "epoch": 1.6345609065155808, "grad_norm": 2.990945647537025, "learning_rate": 5.033138920389313e-06, "loss": 0.4949, "step": 145 }, { "epoch": 1.6458923512747874, "grad_norm": 0.9622418163601026, "learning_rate": 4.966861079610688e-06, "loss": 0.4855, "step": 146 }, { "epoch": 1.6572237960339944, "grad_norm": 1.0030167678640822, "learning_rate": 4.900589061674649e-06, "loss": 0.4589, "step": 147 }, { "epoch": 1.6685552407932012, "grad_norm": 1.0109166766299091, "learning_rate": 4.8343345112432475e-06, "loss": 0.4778, "step": 148 }, { "epoch": 1.679886685552408, "grad_norm": 1.0402771028968805, "learning_rate": 4.7681090699093076e-06, "loss": 0.4874, "step": 149 }, { "epoch": 1.6912181303116147, "grad_norm": 1.0333160217244122, "learning_rate": 4.701924374150901e-06, "loss": 0.469, "step": 150 }, { "epoch": 1.7025495750708215, "grad_norm": 1.0264878278726923, "learning_rate": 4.635792053286682e-06, "loss": 0.477, "step": 151 }, { "epoch": 1.7138810198300283, "grad_norm": 0.9806277129349131, "learning_rate": 4.569723727432517e-06, "loss": 0.4609, "step": 152 }, { "epoch": 1.725212464589235, "grad_norm": 1.0430109649067774, "learning_rate": 4.5037310054596936e-06, "loss": 0.4852, "step": 153 }, { "epoch": 1.736543909348442, "grad_norm": 1.0177412955604808, "learning_rate": 4.43782548295514e-06, "loss": 0.4538, "step": 154 }, { "epoch": 1.7478753541076486, "grad_norm": 1.0742221754801993, "learning_rate": 4.372018740183961e-06, "loss": 0.502, "step": 155 }, { "epoch": 1.7592067988668556, "grad_norm": 1.2114594760413002, "learning_rate": 4.30632234005466e-06, "loss": 0.4626, "step": 156 }, { "epoch": 1.7705382436260622, "grad_norm": 1.0105219104936058, "learning_rate": 4.2407478260874294e-06, "loss": 0.4443, "step": 157 }, { "epoch": 1.7818696883852692, "grad_norm": 1.0676939421912321, "learning_rate": 4.175306720385831e-06, "loss": 0.461, "step": 158 }, { "epoch": 1.793201133144476, "grad_norm": 1.0843360976121068, "learning_rate": 4.11001052161225e-06, "loss": 0.4562, "step": 159 }, { "epoch": 1.8045325779036827, "grad_norm": 1.0182445190909426, "learning_rate": 4.044870702967461e-06, "loss": 0.4597, "step": 160 }, { "epoch": 1.8158640226628895, "grad_norm": 1.0266398146802735, "learning_rate": 3.979898710174678e-06, "loss": 0.4737, "step": 161 }, { "epoch": 1.8271954674220963, "grad_norm": 1.0375307407230006, "learning_rate": 3.91510595946841e-06, "loss": 0.476, "step": 162 }, { "epoch": 1.8385269121813033, "grad_norm": 1.0510195116895713, "learning_rate": 3.850503835588533e-06, "loss": 0.4572, "step": 163 }, { "epoch": 1.8498583569405098, "grad_norm": 1.0707576258473916, "learning_rate": 3.786103689779861e-06, "loss": 0.4855, "step": 164 }, { "epoch": 1.8611898016997168, "grad_norm": 1.109879197789788, "learning_rate": 3.721916837797627e-06, "loss": 0.4744, "step": 165 }, { "epoch": 1.8725212464589234, "grad_norm": 0.9430434127126872, "learning_rate": 3.6579545579191834e-06, "loss": 0.5036, "step": 166 }, { "epoch": 1.8838526912181304, "grad_norm": 1.0454617136926816, "learning_rate": 3.5942280889623028e-06, "loss": 0.4757, "step": 167 }, { "epoch": 1.8951841359773371, "grad_norm": 0.9669993221473043, "learning_rate": 3.5307486283103966e-06, "loss": 0.4939, "step": 168 }, { "epoch": 1.906515580736544, "grad_norm": 1.1489596332179548, "learning_rate": 3.4675273299450264e-06, "loss": 0.4875, "step": 169 }, { "epoch": 1.9178470254957507, "grad_norm": 1.236638321882873, "learning_rate": 3.4045753024860393e-06, "loss": 0.4899, "step": 170 }, { "epoch": 1.9291784702549575, "grad_norm": 1.0015067232304347, "learning_rate": 3.3419036072396614e-06, "loss": 0.4367, "step": 171 }, { "epoch": 1.9405099150141643, "grad_norm": 0.991139662986458, "learning_rate": 3.2795232562549296e-06, "loss": 0.4593, "step": 172 }, { "epoch": 1.951841359773371, "grad_norm": 1.0171228373147831, "learning_rate": 3.2174452103887455e-06, "loss": 0.4864, "step": 173 }, { "epoch": 1.963172804532578, "grad_norm": 1.0183503025841374, "learning_rate": 3.1556803773799616e-06, "loss": 0.4775, "step": 174 }, { "epoch": 1.9745042492917846, "grad_norm": 0.9658158834425475, "learning_rate": 3.0942396099327645e-06, "loss": 0.4628, "step": 175 }, { "epoch": 1.9858356940509916, "grad_norm": 1.0046391704473616, "learning_rate": 3.03313370380976e-06, "loss": 0.4945, "step": 176 }, { "epoch": 1.9971671388101981, "grad_norm": 0.9746868290860945, "learning_rate": 2.972373395935031e-06, "loss": 0.4384, "step": 177 }, { "epoch": 2.0, "grad_norm": 0.9746868290860945, "learning_rate": 2.911969362507574e-06, "loss": 0.4562, "step": 178 }, { "epoch": 2.011331444759207, "grad_norm": 2.1634786845934584, "learning_rate": 2.8519322171253605e-06, "loss": 0.3576, "step": 179 }, { "epoch": 2.0226628895184136, "grad_norm": 1.259167809122465, "learning_rate": 2.792272508920443e-06, "loss": 0.3306, "step": 180 }, { "epoch": 2.0339943342776206, "grad_norm": 1.3388873888110011, "learning_rate": 2.7330007207053413e-06, "loss": 0.353, "step": 181 }, { "epoch": 2.045325779036827, "grad_norm": 1.1581849151502048, "learning_rate": 2.674127267131131e-06, "loss": 0.3317, "step": 182 }, { "epoch": 2.056657223796034, "grad_norm": 1.0160032268336192, "learning_rate": 2.615662492857471e-06, "loss": 0.3581, "step": 183 }, { "epoch": 2.0679886685552407, "grad_norm": 1.0233678646861728, "learning_rate": 2.5576166707349387e-06, "loss": 0.3359, "step": 184 }, { "epoch": 2.0793201133144477, "grad_norm": 1.0874679159300038, "learning_rate": 2.5000000000000015e-06, "loss": 0.3219, "step": 185 }, { "epoch": 2.090651558073654, "grad_norm": 1.2469998353736902, "learning_rate": 2.4428226044828896e-06, "loss": 0.3271, "step": 186 }, { "epoch": 2.101983002832861, "grad_norm": 1.1847806975199535, "learning_rate": 2.3860945308287554e-06, "loss": 0.3429, "step": 187 }, { "epoch": 2.113314447592068, "grad_norm": 1.3829661881977866, "learning_rate": 2.3298257467323605e-06, "loss": 0.3492, "step": 188 }, { "epoch": 2.1246458923512748, "grad_norm": 1.1118666347289263, "learning_rate": 2.2740261391866634e-06, "loss": 0.3343, "step": 189 }, { "epoch": 2.1359773371104818, "grad_norm": 1.1295786044065697, "learning_rate": 2.2187055127455653e-06, "loss": 0.3306, "step": 190 }, { "epoch": 2.1473087818696883, "grad_norm": 1.3950194361496737, "learning_rate": 2.1638735878011603e-06, "loss": 0.3515, "step": 191 }, { "epoch": 2.1586402266288953, "grad_norm": 1.1337210762125438, "learning_rate": 2.1095399988757574e-06, "loss": 0.3201, "step": 192 }, { "epoch": 2.169971671388102, "grad_norm": 1.059433716116878, "learning_rate": 2.0557142929290027e-06, "loss": 0.3526, "step": 193 }, { "epoch": 2.181303116147309, "grad_norm": 1.0876920114742847, "learning_rate": 2.0024059276803742e-06, "loss": 0.3275, "step": 194 }, { "epoch": 2.1926345609065154, "grad_norm": 1.136528776323311, "learning_rate": 1.949624269947378e-06, "loss": 0.3499, "step": 195 }, { "epoch": 2.2039660056657224, "grad_norm": 1.1195654060494844, "learning_rate": 1.897378593999693e-06, "loss": 0.3105, "step": 196 }, { "epoch": 2.215297450424929, "grad_norm": 1.0686107201673802, "learning_rate": 1.8456780799295888e-06, "loss": 0.3409, "step": 197 }, { "epoch": 2.226628895184136, "grad_norm": 1.1176135978118285, "learning_rate": 1.794531812038901e-06, "loss": 0.3242, "step": 198 }, { "epoch": 2.237960339943343, "grad_norm": 1.1225522593354427, "learning_rate": 1.7439487772428142e-06, "loss": 0.3331, "step": 199 }, { "epoch": 2.2492917847025495, "grad_norm": 1.0504526826797216, "learning_rate": 1.6939378634907815e-06, "loss": 0.3223, "step": 200 }, { "epoch": 2.2606232294617565, "grad_norm": 1.0112717450368687, "learning_rate": 1.6445078582048158e-06, "loss": 0.3328, "step": 201 }, { "epoch": 2.271954674220963, "grad_norm": 1.0056815807805697, "learning_rate": 1.5956674467354538e-06, "loss": 0.3349, "step": 202 }, { "epoch": 2.28328611898017, "grad_norm": 1.0324761445382153, "learning_rate": 1.5474252108356475e-06, "loss": 0.3147, "step": 203 }, { "epoch": 2.2946175637393766, "grad_norm": 3.0756191856725437, "learning_rate": 1.499789627152874e-06, "loss": 0.3148, "step": 204 }, { "epoch": 2.3059490084985836, "grad_norm": 1.14933836933374, "learning_rate": 1.452769065739688e-06, "loss": 0.3487, "step": 205 }, { "epoch": 2.31728045325779, "grad_norm": 0.9691075451255097, "learning_rate": 1.4063717885830375e-06, "loss": 0.3216, "step": 206 }, { "epoch": 2.328611898016997, "grad_norm": 1.2745727227347767, "learning_rate": 1.3606059481525296e-06, "loss": 0.3585, "step": 207 }, { "epoch": 2.3399433427762037, "grad_norm": 0.9868916262509804, "learning_rate": 1.3154795859679781e-06, "loss": 0.3416, "step": 208 }, { "epoch": 2.3512747875354107, "grad_norm": 1.029329657926381, "learning_rate": 1.2710006311864104e-06, "loss": 0.3438, "step": 209 }, { "epoch": 2.3626062322946177, "grad_norm": 1.2301937202365874, "learning_rate": 1.227176899208849e-06, "loss": 0.3232, "step": 210 }, { "epoch": 2.3739376770538243, "grad_norm": 1.1079694734813215, "learning_rate": 1.1840160903070591e-06, "loss": 0.3533, "step": 211 }, { "epoch": 2.3852691218130313, "grad_norm": 1.0355467829487406, "learning_rate": 1.141525788270531e-06, "loss": 0.3455, "step": 212 }, { "epoch": 2.396600566572238, "grad_norm": 1.0285862271263877, "learning_rate": 1.09971345907394e-06, "loss": 0.2994, "step": 213 }, { "epoch": 2.407932011331445, "grad_norm": 1.0633893519411577, "learning_rate": 1.0585864495652899e-06, "loss": 0.3386, "step": 214 }, { "epoch": 2.4192634560906514, "grad_norm": 0.985097331489618, "learning_rate": 1.0181519861750078e-06, "loss": 0.3181, "step": 215 }, { "epoch": 2.4305949008498584, "grad_norm": 0.951413780263271, "learning_rate": 9.784171736461762e-07, "loss": 0.3105, "step": 216 }, { "epoch": 2.441926345609065, "grad_norm": 1.0282273427987358, "learning_rate": 9.393889937861694e-07, "loss": 0.3179, "step": 217 }, { "epoch": 2.453257790368272, "grad_norm": 1.026529941608791, "learning_rate": 9.010743042398684e-07, "loss": 0.3234, "step": 218 }, { "epoch": 2.4645892351274785, "grad_norm": 1.0299442438320148, "learning_rate": 8.634798372847148e-07, "loss": 0.335, "step": 219 }, { "epoch": 2.4759206798866855, "grad_norm": 0.9309231031132973, "learning_rate": 8.266121986477699e-07, "loss": 0.318, "step": 220 }, { "epoch": 2.4872521246458925, "grad_norm": 1.0062159661580126, "learning_rate": 7.904778663450325e-07, "loss": 0.3292, "step": 221 }, { "epoch": 2.498583569405099, "grad_norm": 1.0354919361102888, "learning_rate": 7.550831895431799e-07, "loss": 0.3266, "step": 222 }, { "epoch": 2.509915014164306, "grad_norm": 0.9693415538045153, "learning_rate": 7.204343874439578e-07, "loss": 0.3282, "step": 223 }, { "epoch": 2.5212464589235126, "grad_norm": 1.0178821797615285, "learning_rate": 6.865375481914017e-07, "loss": 0.3561, "step": 224 }, { "epoch": 2.5325779036827196, "grad_norm": 1.0271091771586642, "learning_rate": 6.533986278020876e-07, "loss": 0.3064, "step": 225 }, { "epoch": 2.543909348441926, "grad_norm": 0.9930205073186488, "learning_rate": 6.210234491186079e-07, "loss": 0.318, "step": 226 }, { "epoch": 2.555240793201133, "grad_norm": 1.015466323155115, "learning_rate": 5.894177007864272e-07, "loss": 0.3408, "step": 227 }, { "epoch": 2.56657223796034, "grad_norm": 1.065785552873228, "learning_rate": 5.585869362543416e-07, "loss": 0.3414, "step": 228 }, { "epoch": 2.5779036827195467, "grad_norm": 1.0524927446179813, "learning_rate": 5.285365727986708e-07, "loss": 0.3422, "step": 229 }, { "epoch": 2.5892351274787533, "grad_norm": 1.0219196548167786, "learning_rate": 4.992718905713967e-07, "loss": 0.3388, "step": 230 }, { "epoch": 2.6005665722379603, "grad_norm": 0.9679912813387603, "learning_rate": 4.707980316723837e-07, "loss": 0.3165, "step": 231 }, { "epoch": 2.6118980169971673, "grad_norm": 0.9893500327460035, "learning_rate": 4.431199992458607e-07, "loss": 0.3238, "step": 232 }, { "epoch": 2.623229461756374, "grad_norm": 0.9876579686339385, "learning_rate": 4.16242656601315e-07, "loss": 0.308, "step": 233 }, { "epoch": 2.634560906515581, "grad_norm": 1.01213916356771, "learning_rate": 3.9017072635896716e-07, "loss": 0.331, "step": 234 }, { "epoch": 2.6458923512747874, "grad_norm": 1.0151577294613559, "learning_rate": 3.649087896199488e-07, "loss": 0.3098, "step": 235 }, { "epoch": 2.6572237960339944, "grad_norm": 0.9854787297770221, "learning_rate": 3.404612851613676e-07, "loss": 0.3202, "step": 236 }, { "epoch": 2.668555240793201, "grad_norm": 2.5197939583747866, "learning_rate": 3.168325086563612e-07, "loss": 0.3302, "step": 237 }, { "epoch": 2.679886685552408, "grad_norm": 0.9681009670329549, "learning_rate": 2.9402661191930803e-07, "loss": 0.3221, "step": 238 }, { "epoch": 2.691218130311615, "grad_norm": 1.0155833734622453, "learning_rate": 2.7204760217631074e-07, "loss": 0.324, "step": 239 }, { "epoch": 2.7025495750708215, "grad_norm": 1.1931982505904983, "learning_rate": 2.5089934136108665e-07, "loss": 0.3327, "step": 240 }, { "epoch": 2.713881019830028, "grad_norm": 0.9735860788683143, "learning_rate": 2.30585545436387e-07, "loss": 0.3483, "step": 241 }, { "epoch": 2.725212464589235, "grad_norm": 0.9628214952166717, "learning_rate": 2.1110978374106195e-07, "loss": 0.3455, "step": 242 }, { "epoch": 2.736543909348442, "grad_norm": 1.4367674984114238, "learning_rate": 1.9247547836289792e-07, "loss": 0.3565, "step": 243 }, { "epoch": 2.7478753541076486, "grad_norm": 1.0738794442822241, "learning_rate": 1.7468590353731495e-07, "loss": 0.3577, "step": 244 }, { "epoch": 2.7592067988668556, "grad_norm": 1.0163993435166494, "learning_rate": 1.577441850720568e-07, "loss": 0.3346, "step": 245 }, { "epoch": 2.770538243626062, "grad_norm": 1.1268283470669345, "learning_rate": 1.4165329979794972e-07, "loss": 0.3204, "step": 246 }, { "epoch": 2.781869688385269, "grad_norm": 1.00412302366148, "learning_rate": 1.264160750458493e-07, "loss": 0.3091, "step": 247 }, { "epoch": 2.7932011331444757, "grad_norm": 1.0878323463224275, "learning_rate": 1.1203518814984216e-07, "loss": 0.3219, "step": 248 }, { "epoch": 2.8045325779036827, "grad_norm": 1.0326844241286977, "learning_rate": 9.851316597681959e-08, "loss": 0.3407, "step": 249 }, { "epoch": 2.8158640226628897, "grad_norm": 1.0488660487318535, "learning_rate": 8.585238448247434e-08, "loss": 0.3066, "step": 250 }, { "epoch": 2.8271954674220963, "grad_norm": 0.9440222402450956, "learning_rate": 7.405506829382736e-08, "loss": 0.3007, "step": 251 }, { "epoch": 2.8385269121813033, "grad_norm": 0.9992965787158642, "learning_rate": 6.31232903183332e-08, "loss": 0.3211, "step": 252 }, { "epoch": 2.84985835694051, "grad_norm": 1.0525889142182898, "learning_rate": 5.305897137965199e-08, "loss": 0.3339, "step": 253 }, { "epoch": 2.861189801699717, "grad_norm": 1.0406232501867803, "learning_rate": 4.3863879880142737e-08, "loss": 0.3188, "step": 254 }, { "epoch": 2.8725212464589234, "grad_norm": 1.0108504238438418, "learning_rate": 3.553963149013295e-08, "loss": 0.3426, "step": 255 }, { "epoch": 2.8838526912181304, "grad_norm": 1.040975846702501, "learning_rate": 2.8087688864033014e-08, "loss": 0.3365, "step": 256 }, { "epoch": 2.8951841359773374, "grad_norm": 1.0279134406587973, "learning_rate": 2.1509361383330597e-08, "loss": 0.3167, "step": 257 }, { "epoch": 2.906515580736544, "grad_norm": 1.0127896976081647, "learning_rate": 1.580580492652084e-08, "loss": 0.3589, "step": 258 }, { "epoch": 2.9178470254957505, "grad_norm": 1.002944001928922, "learning_rate": 1.0978021666005479e-08, "loss": 0.3382, "step": 259 }, { "epoch": 2.9291784702549575, "grad_norm": 0.9936405641782646, "learning_rate": 7.02685989200258e-09, "loss": 0.3373, "step": 260 }, { "epoch": 2.9405099150141645, "grad_norm": 1.0916598818224916, "learning_rate": 3.953013863490784e-09, "loss": 0.3124, "step": 261 }, { "epoch": 2.951841359773371, "grad_norm": 0.9881063904383428, "learning_rate": 1.757023686224102e-09, "loss": 0.3401, "step": 262 }, { "epoch": 2.963172804532578, "grad_norm": 0.9953124861905701, "learning_rate": 4.392752178278281e-10, "loss": 0.3202, "step": 263 }, { "epoch": 2.9745042492917846, "grad_norm": 0.9957928075882635, "learning_rate": 0.0, "loss": 0.299, "step": 264 } ], "logging_steps": 1, "max_steps": 264, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 72196646453248.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }