{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6968838526912182, "eval_steps": 50, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0113314447592068, "grad_norm": Infinity, "learning_rate": 0.0, "loss": 3.3809, "step": 2 }, { "epoch": 0.0226628895184136, "grad_norm": 744.6710815429688, "learning_rate": 0.0, "loss": 3.2718, "step": 4 }, { "epoch": 0.0339943342776204, "grad_norm": 575.2051391601562, "learning_rate": 4e-08, "loss": 3.0156, "step": 6 }, { "epoch": 0.0453257790368272, "grad_norm": 632.8484497070312, "learning_rate": 8e-08, "loss": 3.6857, "step": 8 }, { "epoch": 0.056657223796033995, "grad_norm": 615.0200805664062, "learning_rate": 1.2e-07, "loss": 3.2988, "step": 10 }, { "epoch": 0.0679886685552408, "grad_norm": 807.7252197265625, "learning_rate": 1.6e-07, "loss": 3.798, "step": 12 }, { "epoch": 0.07932011331444759, "grad_norm": 661.0059814453125, "learning_rate": 2e-07, "loss": 2.7579, "step": 14 }, { "epoch": 0.0906515580736544, "grad_norm": 391.60333251953125, "learning_rate": 1.9954022988505746e-07, "loss": 1.749, "step": 16 }, { "epoch": 0.10198300283286119, "grad_norm": 350.3952331542969, "learning_rate": 1.9908045977011495e-07, "loss": 1.5168, "step": 18 }, { "epoch": 0.11331444759206799, "grad_norm": 23.803192138671875, "learning_rate": 1.9862068965517241e-07, "loss": 1.062, "step": 20 }, { "epoch": 0.12464589235127478, "grad_norm": 26.993032455444336, "learning_rate": 1.9816091954022985e-07, "loss": 0.8715, "step": 22 }, { "epoch": 0.1359773371104816, "grad_norm": 20.856576919555664, "learning_rate": 1.9770114942528734e-07, "loss": 0.8205, "step": 24 }, { "epoch": 0.14730878186968838, "grad_norm": 17.476659774780273, "learning_rate": 1.972413793103448e-07, "loss": 0.8846, "step": 26 }, { "epoch": 0.15864022662889518, "grad_norm": 17.18401336669922, "learning_rate": 1.967816091954023e-07, "loss": 0.8203, "step": 28 }, { "epoch": 0.16997167138810199, "grad_norm": 16.565576553344727, "learning_rate": 1.9632183908045977e-07, "loss": 0.8568, "step": 30 }, { "epoch": 0.1813031161473088, "grad_norm": 11.906989097595215, "learning_rate": 1.9586206896551723e-07, "loss": 0.6635, "step": 32 }, { "epoch": 0.19263456090651557, "grad_norm": 12.794220924377441, "learning_rate": 1.954022988505747e-07, "loss": 0.7228, "step": 34 }, { "epoch": 0.20396600566572237, "grad_norm": 12.355205535888672, "learning_rate": 1.9494252873563216e-07, "loss": 0.6619, "step": 36 }, { "epoch": 0.21529745042492918, "grad_norm": 12.685115814208984, "learning_rate": 1.9448275862068963e-07, "loss": 0.5935, "step": 38 }, { "epoch": 0.22662889518413598, "grad_norm": 11.868419647216797, "learning_rate": 1.9402298850574712e-07, "loss": 0.6357, "step": 40 }, { "epoch": 0.23796033994334279, "grad_norm": 12.916513442993164, "learning_rate": 1.9356321839080458e-07, "loss": 0.6429, "step": 42 }, { "epoch": 0.24929178470254956, "grad_norm": 11.732407569885254, "learning_rate": 1.9310344827586208e-07, "loss": 0.6393, "step": 44 }, { "epoch": 0.26062322946175637, "grad_norm": 11.4376802444458, "learning_rate": 1.9264367816091954e-07, "loss": 0.5204, "step": 46 }, { "epoch": 0.2719546742209632, "grad_norm": 10.639026641845703, "learning_rate": 1.92183908045977e-07, "loss": 0.5111, "step": 48 }, { "epoch": 0.28328611898017, "grad_norm": 10.808749198913574, "learning_rate": 1.9172413793103447e-07, "loss": 0.5136, "step": 50 }, { "epoch": 0.28328611898017, "eval_loss": 0.25907036662101746, "eval_runtime": 58.3245, "eval_samples_per_second": 6.001, "eval_steps_per_second": 2.006, "step": 50 }, { "epoch": 0.29461756373937675, "grad_norm": 8.869841575622559, "learning_rate": 1.9126436781609194e-07, "loss": 0.4137, "step": 52 }, { "epoch": 0.3059490084985836, "grad_norm": 9.097506523132324, "learning_rate": 1.908045977011494e-07, "loss": 0.401, "step": 54 }, { "epoch": 0.31728045325779036, "grad_norm": 9.849846839904785, "learning_rate": 1.903448275862069e-07, "loss": 0.495, "step": 56 }, { "epoch": 0.3286118980169972, "grad_norm": 9.619014739990234, "learning_rate": 1.8988505747126436e-07, "loss": 0.4409, "step": 58 }, { "epoch": 0.33994334277620397, "grad_norm": 7.777515411376953, "learning_rate": 1.8942528735632185e-07, "loss": 0.4448, "step": 60 }, { "epoch": 0.35127478753541075, "grad_norm": 7.151308536529541, "learning_rate": 1.889655172413793e-07, "loss": 0.3374, "step": 62 }, { "epoch": 0.3626062322946176, "grad_norm": 8.01666259765625, "learning_rate": 1.8850574712643676e-07, "loss": 0.3785, "step": 64 }, { "epoch": 0.37393767705382436, "grad_norm": 8.04469108581543, "learning_rate": 1.8804597701149425e-07, "loss": 0.4004, "step": 66 }, { "epoch": 0.38526912181303113, "grad_norm": 8.3555326461792, "learning_rate": 1.8758620689655171e-07, "loss": 0.3751, "step": 68 }, { "epoch": 0.39660056657223797, "grad_norm": 6.18545389175415, "learning_rate": 1.8712643678160918e-07, "loss": 0.3077, "step": 70 }, { "epoch": 0.40793201133144474, "grad_norm": 7.299217700958252, "learning_rate": 1.8666666666666667e-07, "loss": 0.3527, "step": 72 }, { "epoch": 0.4192634560906516, "grad_norm": 8.593052864074707, "learning_rate": 1.8620689655172414e-07, "loss": 0.3186, "step": 74 }, { "epoch": 0.43059490084985835, "grad_norm": 7.345818996429443, "learning_rate": 1.857471264367816e-07, "loss": 0.2912, "step": 76 }, { "epoch": 0.44192634560906513, "grad_norm": 7.447642803192139, "learning_rate": 1.8528735632183907e-07, "loss": 0.3105, "step": 78 }, { "epoch": 0.45325779036827196, "grad_norm": 8.07083511352539, "learning_rate": 1.8482758620689653e-07, "loss": 0.2464, "step": 80 }, { "epoch": 0.46458923512747874, "grad_norm": 7.193418979644775, "learning_rate": 1.8436781609195402e-07, "loss": 0.2381, "step": 82 }, { "epoch": 0.47592067988668557, "grad_norm": 7.549230575561523, "learning_rate": 1.839080459770115e-07, "loss": 0.2598, "step": 84 }, { "epoch": 0.48725212464589235, "grad_norm": 7.774433135986328, "learning_rate": 1.8344827586206895e-07, "loss": 0.2261, "step": 86 }, { "epoch": 0.4985835694050991, "grad_norm": 7.496689319610596, "learning_rate": 1.8298850574712642e-07, "loss": 0.2159, "step": 88 }, { "epoch": 0.509915014164306, "grad_norm": 4.929551124572754, "learning_rate": 1.8252873563218388e-07, "loss": 0.1528, "step": 90 }, { "epoch": 0.5212464589235127, "grad_norm": 9.243413925170898, "learning_rate": 1.8206896551724138e-07, "loss": 0.176, "step": 92 }, { "epoch": 0.5325779036827195, "grad_norm": 7.296377182006836, "learning_rate": 1.8160919540229884e-07, "loss": 0.1588, "step": 94 }, { "epoch": 0.5439093484419264, "grad_norm": 9.553417205810547, "learning_rate": 1.811494252873563e-07, "loss": 0.1543, "step": 96 }, { "epoch": 0.5552407932011332, "grad_norm": 7.351907730102539, "learning_rate": 1.806896551724138e-07, "loss": 0.1433, "step": 98 }, { "epoch": 0.56657223796034, "grad_norm": 5.737069129943848, "learning_rate": 1.8022988505747126e-07, "loss": 0.1228, "step": 100 }, { "epoch": 0.56657223796034, "eval_loss": 0.06613212823867798, "eval_runtime": 58.5063, "eval_samples_per_second": 5.982, "eval_steps_per_second": 2.0, "step": 100 }, { "epoch": 0.5779036827195467, "grad_norm": 8.382906913757324, "learning_rate": 1.7977011494252873e-07, "loss": 0.1471, "step": 102 }, { "epoch": 0.5892351274787535, "grad_norm": 6.838889122009277, "learning_rate": 1.793103448275862e-07, "loss": 0.1035, "step": 104 }, { "epoch": 0.6005665722379604, "grad_norm": 6.563055515289307, "learning_rate": 1.7885057471264366e-07, "loss": 0.0872, "step": 106 }, { "epoch": 0.6118980169971672, "grad_norm": 7.472197532653809, "learning_rate": 1.7839080459770115e-07, "loss": 0.1129, "step": 108 }, { "epoch": 0.623229461756374, "grad_norm": 7.348904132843018, "learning_rate": 1.7793103448275862e-07, "loss": 0.1099, "step": 110 }, { "epoch": 0.6345609065155807, "grad_norm": 8.309453010559082, "learning_rate": 1.7747126436781608e-07, "loss": 0.112, "step": 112 }, { "epoch": 0.6458923512747875, "grad_norm": 7.677189826965332, "learning_rate": 1.7701149425287355e-07, "loss": 0.1016, "step": 114 }, { "epoch": 0.6572237960339944, "grad_norm": 6.976114273071289, "learning_rate": 1.7655172413793101e-07, "loss": 0.0862, "step": 116 }, { "epoch": 0.6685552407932012, "grad_norm": 7.3442487716674805, "learning_rate": 1.7609195402298848e-07, "loss": 0.1174, "step": 118 }, { "epoch": 0.6798866855524079, "grad_norm": 6.72037935256958, "learning_rate": 1.7563218390804597e-07, "loss": 0.0906, "step": 120 }, { "epoch": 0.6912181303116147, "grad_norm": 7.490478515625, "learning_rate": 1.7517241379310344e-07, "loss": 0.1107, "step": 122 }, { "epoch": 0.7025495750708215, "grad_norm": 8.412711143493652, "learning_rate": 1.7471264367816093e-07, "loss": 0.1262, "step": 124 }, { "epoch": 0.7138810198300283, "grad_norm": 6.162937641143799, "learning_rate": 1.742528735632184e-07, "loss": 0.0735, "step": 126 }, { "epoch": 0.7252124645892352, "grad_norm": 7.0229363441467285, "learning_rate": 1.7379310344827586e-07, "loss": 0.0767, "step": 128 }, { "epoch": 0.7365439093484419, "grad_norm": 7.203760147094727, "learning_rate": 1.7333333333333332e-07, "loss": 0.0838, "step": 130 }, { "epoch": 0.7478753541076487, "grad_norm": 8.376720428466797, "learning_rate": 1.728735632183908e-07, "loss": 0.0873, "step": 132 }, { "epoch": 0.7592067988668555, "grad_norm": 7.050066947937012, "learning_rate": 1.7241379310344825e-07, "loss": 0.0792, "step": 134 }, { "epoch": 0.7705382436260623, "grad_norm": 8.225197792053223, "learning_rate": 1.7195402298850575e-07, "loss": 0.0975, "step": 136 }, { "epoch": 0.7818696883852692, "grad_norm": 7.450314521789551, "learning_rate": 1.714942528735632e-07, "loss": 0.1, "step": 138 }, { "epoch": 0.7932011331444759, "grad_norm": 7.37244987487793, "learning_rate": 1.710344827586207e-07, "loss": 0.0754, "step": 140 }, { "epoch": 0.8045325779036827, "grad_norm": 7.468136787414551, "learning_rate": 1.7057471264367814e-07, "loss": 0.0794, "step": 142 }, { "epoch": 0.8158640226628895, "grad_norm": 8.50837516784668, "learning_rate": 1.701149425287356e-07, "loss": 0.0868, "step": 144 }, { "epoch": 0.8271954674220963, "grad_norm": 8.161556243896484, "learning_rate": 1.696551724137931e-07, "loss": 0.103, "step": 146 }, { "epoch": 0.8385269121813032, "grad_norm": 7.930548667907715, "learning_rate": 1.6919540229885056e-07, "loss": 0.0788, "step": 148 }, { "epoch": 0.8498583569405099, "grad_norm": 7.564940929412842, "learning_rate": 1.6873563218390803e-07, "loss": 0.0846, "step": 150 }, { "epoch": 0.8498583569405099, "eval_loss": 0.043400950729846954, "eval_runtime": 58.5966, "eval_samples_per_second": 5.973, "eval_steps_per_second": 1.997, "step": 150 }, { "epoch": 0.8611898016997167, "grad_norm": 7.810537338256836, "learning_rate": 1.6827586206896552e-07, "loss": 0.0861, "step": 152 }, { "epoch": 0.8725212464589235, "grad_norm": 6.980551242828369, "learning_rate": 1.67816091954023e-07, "loss": 0.0659, "step": 154 }, { "epoch": 0.8838526912181303, "grad_norm": 7.5854597091674805, "learning_rate": 1.6735632183908045e-07, "loss": 0.0655, "step": 156 }, { "epoch": 0.8951841359773371, "grad_norm": 7.803754806518555, "learning_rate": 1.6689655172413792e-07, "loss": 0.0756, "step": 158 }, { "epoch": 0.9065155807365439, "grad_norm": 6.776433944702148, "learning_rate": 1.6643678160919538e-07, "loss": 0.0661, "step": 160 }, { "epoch": 0.9178470254957507, "grad_norm": 7.673105239868164, "learning_rate": 1.6597701149425287e-07, "loss": 0.105, "step": 162 }, { "epoch": 0.9291784702549575, "grad_norm": 7.120344161987305, "learning_rate": 1.6551724137931034e-07, "loss": 0.0797, "step": 164 }, { "epoch": 0.9405099150141643, "grad_norm": 8.53663158416748, "learning_rate": 1.650574712643678e-07, "loss": 0.0942, "step": 166 }, { "epoch": 0.9518413597733711, "grad_norm": 9.79232406616211, "learning_rate": 1.6459770114942527e-07, "loss": 0.0885, "step": 168 }, { "epoch": 0.9631728045325779, "grad_norm": 7.993936061859131, "learning_rate": 1.6413793103448274e-07, "loss": 0.0696, "step": 170 }, { "epoch": 0.9745042492917847, "grad_norm": 7.143523216247559, "learning_rate": 1.6367816091954023e-07, "loss": 0.0735, "step": 172 }, { "epoch": 0.9858356940509915, "grad_norm": 7.495512008666992, "learning_rate": 1.632183908045977e-07, "loss": 0.0838, "step": 174 }, { "epoch": 0.9971671388101983, "grad_norm": 7.799754619598389, "learning_rate": 1.6275862068965516e-07, "loss": 0.082, "step": 176 }, { "epoch": 1.0056657223796035, "grad_norm": 7.807611465454102, "learning_rate": 1.6229885057471265e-07, "loss": 0.0548, "step": 178 }, { "epoch": 1.0169971671388103, "grad_norm": 6.675009250640869, "learning_rate": 1.6183908045977012e-07, "loss": 0.0841, "step": 180 }, { "epoch": 1.028328611898017, "grad_norm": 6.997397422790527, "learning_rate": 1.6137931034482758e-07, "loss": 0.0602, "step": 182 }, { "epoch": 1.0396600566572238, "grad_norm": 7.82025146484375, "learning_rate": 1.6091954022988505e-07, "loss": 0.0745, "step": 184 }, { "epoch": 1.0509915014164306, "grad_norm": 7.6183180809021, "learning_rate": 1.604597701149425e-07, "loss": 0.0813, "step": 186 }, { "epoch": 1.0623229461756374, "grad_norm": 7.332201957702637, "learning_rate": 1.6e-07, "loss": 0.0693, "step": 188 }, { "epoch": 1.0736543909348442, "grad_norm": 6.992514133453369, "learning_rate": 1.5954022988505747e-07, "loss": 0.0637, "step": 190 }, { "epoch": 1.084985835694051, "grad_norm": 7.306890964508057, "learning_rate": 1.5908045977011493e-07, "loss": 0.0695, "step": 192 }, { "epoch": 1.0963172804532577, "grad_norm": 7.079704284667969, "learning_rate": 1.5862068965517243e-07, "loss": 0.0646, "step": 194 }, { "epoch": 1.1076487252124645, "grad_norm": 7.926516532897949, "learning_rate": 1.5816091954022986e-07, "loss": 0.0676, "step": 196 }, { "epoch": 1.1189801699716715, "grad_norm": 8.795439720153809, "learning_rate": 1.5770114942528733e-07, "loss": 0.0933, "step": 198 }, { "epoch": 1.1303116147308783, "grad_norm": 7.790001392364502, "learning_rate": 1.5724137931034482e-07, "loss": 0.0677, "step": 200 }, { "epoch": 1.1303116147308783, "eval_loss": 0.03847797214984894, "eval_runtime": 58.5674, "eval_samples_per_second": 5.976, "eval_steps_per_second": 1.998, "step": 200 }, { "epoch": 1.141643059490085, "grad_norm": 7.721756458282471, "learning_rate": 1.567816091954023e-07, "loss": 0.0791, "step": 202 }, { "epoch": 1.1529745042492918, "grad_norm": 7.5822343826293945, "learning_rate": 1.5632183908045978e-07, "loss": 0.0766, "step": 204 }, { "epoch": 1.1643059490084986, "grad_norm": 6.944662570953369, "learning_rate": 1.5586206896551724e-07, "loss": 0.0695, "step": 206 }, { "epoch": 1.1756373937677054, "grad_norm": 7.224205493927002, "learning_rate": 1.554022988505747e-07, "loss": 0.0754, "step": 208 }, { "epoch": 1.1869688385269122, "grad_norm": 8.137146949768066, "learning_rate": 1.5494252873563217e-07, "loss": 0.0795, "step": 210 }, { "epoch": 1.198300283286119, "grad_norm": 6.717535018920898, "learning_rate": 1.5448275862068964e-07, "loss": 0.0591, "step": 212 }, { "epoch": 1.2096317280453257, "grad_norm": 7.468085765838623, "learning_rate": 1.540229885057471e-07, "loss": 0.0556, "step": 214 }, { "epoch": 1.2209631728045325, "grad_norm": 7.386537551879883, "learning_rate": 1.535632183908046e-07, "loss": 0.0635, "step": 216 }, { "epoch": 1.2322946175637393, "grad_norm": 6.413255214691162, "learning_rate": 1.5310344827586206e-07, "loss": 0.0586, "step": 218 }, { "epoch": 1.2436260623229463, "grad_norm": 6.197726726531982, "learning_rate": 1.5264367816091955e-07, "loss": 0.0593, "step": 220 }, { "epoch": 1.254957507082153, "grad_norm": 6.984250068664551, "learning_rate": 1.52183908045977e-07, "loss": 0.0687, "step": 222 }, { "epoch": 1.2662889518413598, "grad_norm": 6.338926315307617, "learning_rate": 1.5172413793103446e-07, "loss": 0.0648, "step": 224 }, { "epoch": 1.2776203966005666, "grad_norm": 6.7211384773254395, "learning_rate": 1.5126436781609195e-07, "loss": 0.074, "step": 226 }, { "epoch": 1.2889518413597734, "grad_norm": 7.576125621795654, "learning_rate": 1.5080459770114942e-07, "loss": 0.0701, "step": 228 }, { "epoch": 1.3002832861189801, "grad_norm": 7.268312454223633, "learning_rate": 1.5034482758620688e-07, "loss": 0.069, "step": 230 }, { "epoch": 1.311614730878187, "grad_norm": 7.681404113769531, "learning_rate": 1.4988505747126437e-07, "loss": 0.0644, "step": 232 }, { "epoch": 1.3229461756373937, "grad_norm": 7.724630832672119, "learning_rate": 1.4942528735632184e-07, "loss": 0.0671, "step": 234 }, { "epoch": 1.3342776203966005, "grad_norm": 7.316229820251465, "learning_rate": 1.489655172413793e-07, "loss": 0.059, "step": 236 }, { "epoch": 1.3456090651558075, "grad_norm": 6.948664665222168, "learning_rate": 1.4850574712643677e-07, "loss": 0.0628, "step": 238 }, { "epoch": 1.356940509915014, "grad_norm": 7.636634349822998, "learning_rate": 1.4804597701149423e-07, "loss": 0.0753, "step": 240 }, { "epoch": 1.368271954674221, "grad_norm": 8.362703323364258, "learning_rate": 1.4758620689655173e-07, "loss": 0.0819, "step": 242 }, { "epoch": 1.3796033994334278, "grad_norm": 7.090531826019287, "learning_rate": 1.471264367816092e-07, "loss": 0.0663, "step": 244 }, { "epoch": 1.3909348441926346, "grad_norm": 7.547560214996338, "learning_rate": 1.4666666666666666e-07, "loss": 0.0635, "step": 246 }, { "epoch": 1.4022662889518414, "grad_norm": 7.471890449523926, "learning_rate": 1.4620689655172412e-07, "loss": 0.0517, "step": 248 }, { "epoch": 1.4135977337110481, "grad_norm": 6.707093238830566, "learning_rate": 1.457471264367816e-07, "loss": 0.075, "step": 250 }, { "epoch": 1.4135977337110481, "eval_loss": 0.035781800746917725, "eval_runtime": 58.5812, "eval_samples_per_second": 5.975, "eval_steps_per_second": 1.997, "step": 250 }, { "epoch": 1.424929178470255, "grad_norm": 7.443450450897217, "learning_rate": 1.4528735632183908e-07, "loss": 0.0661, "step": 252 }, { "epoch": 1.4362606232294617, "grad_norm": 7.724794387817383, "learning_rate": 1.4482758620689654e-07, "loss": 0.0651, "step": 254 }, { "epoch": 1.4475920679886687, "grad_norm": 6.492702007293701, "learning_rate": 1.44367816091954e-07, "loss": 0.0481, "step": 256 }, { "epoch": 1.4589235127478752, "grad_norm": 6.878933906555176, "learning_rate": 1.439080459770115e-07, "loss": 0.064, "step": 258 }, { "epoch": 1.4702549575070822, "grad_norm": 7.361611366271973, "learning_rate": 1.4344827586206897e-07, "loss": 0.0644, "step": 260 }, { "epoch": 1.481586402266289, "grad_norm": 6.968555927276611, "learning_rate": 1.4298850574712643e-07, "loss": 0.0609, "step": 262 }, { "epoch": 1.4929178470254958, "grad_norm": 7.533900260925293, "learning_rate": 1.425287356321839e-07, "loss": 0.0607, "step": 264 }, { "epoch": 1.5042492917847026, "grad_norm": 7.612795829772949, "learning_rate": 1.4206896551724136e-07, "loss": 0.0617, "step": 266 }, { "epoch": 1.5155807365439093, "grad_norm": 7.662258148193359, "learning_rate": 1.4160919540229885e-07, "loss": 0.0737, "step": 268 }, { "epoch": 1.5269121813031161, "grad_norm": 7.1722893714904785, "learning_rate": 1.4114942528735632e-07, "loss": 0.0615, "step": 270 }, { "epoch": 1.538243626062323, "grad_norm": 7.579257011413574, "learning_rate": 1.4068965517241379e-07, "loss": 0.0647, "step": 272 }, { "epoch": 1.54957507082153, "grad_norm": 7.028329372406006, "learning_rate": 1.4022988505747128e-07, "loss": 0.0698, "step": 274 }, { "epoch": 1.5609065155807365, "grad_norm": 7.981624126434326, "learning_rate": 1.3977011494252872e-07, "loss": 0.0704, "step": 276 }, { "epoch": 1.5722379603399435, "grad_norm": 7.229209899902344, "learning_rate": 1.3931034482758618e-07, "loss": 0.0662, "step": 278 }, { "epoch": 1.58356940509915, "grad_norm": 8.259303092956543, "learning_rate": 1.3885057471264367e-07, "loss": 0.0622, "step": 280 }, { "epoch": 1.594900849858357, "grad_norm": 8.45683765411377, "learning_rate": 1.3839080459770114e-07, "loss": 0.0846, "step": 282 }, { "epoch": 1.6062322946175638, "grad_norm": 8.067914009094238, "learning_rate": 1.3793103448275863e-07, "loss": 0.0743, "step": 284 }, { "epoch": 1.6175637393767706, "grad_norm": 7.243266582489014, "learning_rate": 1.374712643678161e-07, "loss": 0.0646, "step": 286 }, { "epoch": 1.6288951841359773, "grad_norm": 7.551538944244385, "learning_rate": 1.3701149425287356e-07, "loss": 0.0682, "step": 288 }, { "epoch": 1.6402266288951841, "grad_norm": 8.334272384643555, "learning_rate": 1.3655172413793103e-07, "loss": 0.0614, "step": 290 }, { "epoch": 1.651558073654391, "grad_norm": 7.791774749755859, "learning_rate": 1.360919540229885e-07, "loss": 0.0558, "step": 292 }, { "epoch": 1.6628895184135977, "grad_norm": 8.262134552001953, "learning_rate": 1.3563218390804596e-07, "loss": 0.0575, "step": 294 }, { "epoch": 1.6742209631728047, "grad_norm": 7.904037952423096, "learning_rate": 1.3517241379310345e-07, "loss": 0.079, "step": 296 }, { "epoch": 1.6855524079320112, "grad_norm": 7.991185188293457, "learning_rate": 1.3471264367816091e-07, "loss": 0.0616, "step": 298 }, { "epoch": 1.6968838526912182, "grad_norm": 7.316665172576904, "learning_rate": 1.342528735632184e-07, "loss": 0.0575, "step": 300 }, { "epoch": 1.6968838526912182, "eval_loss": 0.0336877666413784, "eval_runtime": 58.6227, "eval_samples_per_second": 5.97, "eval_steps_per_second": 1.996, "step": 300 } ], "logging_steps": 2, "max_steps": 880, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.292928973307904e+16, "train_batch_size": 3, "trial_name": null, "trial_params": null }