{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.6968838526912182,
  "eval_steps": 50,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0113314447592068,
      "grad_norm": Infinity,
      "learning_rate": 0.0,
      "loss": 3.3809,
      "step": 2
    },
    {
      "epoch": 0.0226628895184136,
      "grad_norm": 744.6710815429688,
      "learning_rate": 0.0,
      "loss": 3.2718,
      "step": 4
    },
    {
      "epoch": 0.0339943342776204,
      "grad_norm": 575.2051391601562,
      "learning_rate": 4e-08,
      "loss": 3.0156,
      "step": 6
    },
    {
      "epoch": 0.0453257790368272,
      "grad_norm": 632.8484497070312,
      "learning_rate": 8e-08,
      "loss": 3.6857,
      "step": 8
    },
    {
      "epoch": 0.056657223796033995,
      "grad_norm": 615.0200805664062,
      "learning_rate": 1.2e-07,
      "loss": 3.2988,
      "step": 10
    },
    {
      "epoch": 0.0679886685552408,
      "grad_norm": 807.7252197265625,
      "learning_rate": 1.6e-07,
      "loss": 3.798,
      "step": 12
    },
    {
      "epoch": 0.07932011331444759,
      "grad_norm": 661.0059814453125,
      "learning_rate": 2e-07,
      "loss": 2.7579,
      "step": 14
    },
    {
      "epoch": 0.0906515580736544,
      "grad_norm": 391.60333251953125,
      "learning_rate": 1.9954022988505746e-07,
      "loss": 1.749,
      "step": 16
    },
    {
      "epoch": 0.10198300283286119,
      "grad_norm": 350.3952331542969,
      "learning_rate": 1.9908045977011495e-07,
      "loss": 1.5168,
      "step": 18
    },
    {
      "epoch": 0.11331444759206799,
      "grad_norm": 23.803192138671875,
      "learning_rate": 1.9862068965517241e-07,
      "loss": 1.062,
      "step": 20
    },
    {
      "epoch": 0.12464589235127478,
      "grad_norm": 26.993032455444336,
      "learning_rate": 1.9816091954022985e-07,
      "loss": 0.8715,
      "step": 22
    },
    {
      "epoch": 0.1359773371104816,
      "grad_norm": 20.856576919555664,
      "learning_rate": 1.9770114942528734e-07,
      "loss": 0.8205,
      "step": 24
    },
    {
      "epoch": 0.14730878186968838,
      "grad_norm": 17.476659774780273,
      "learning_rate": 1.972413793103448e-07,
      "loss": 0.8846,
      "step": 26
    },
    {
      "epoch": 0.15864022662889518,
      "grad_norm": 17.18401336669922,
      "learning_rate": 1.967816091954023e-07,
      "loss": 0.8203,
      "step": 28
    },
    {
      "epoch": 0.16997167138810199,
      "grad_norm": 16.565576553344727,
      "learning_rate": 1.9632183908045977e-07,
      "loss": 0.8568,
      "step": 30
    },
    {
      "epoch": 0.1813031161473088,
      "grad_norm": 11.906989097595215,
      "learning_rate": 1.9586206896551723e-07,
      "loss": 0.6635,
      "step": 32
    },
    {
      "epoch": 0.19263456090651557,
      "grad_norm": 12.794220924377441,
      "learning_rate": 1.954022988505747e-07,
      "loss": 0.7228,
      "step": 34
    },
    {
      "epoch": 0.20396600566572237,
      "grad_norm": 12.355205535888672,
      "learning_rate": 1.9494252873563216e-07,
      "loss": 0.6619,
      "step": 36
    },
    {
      "epoch": 0.21529745042492918,
      "grad_norm": 12.685115814208984,
      "learning_rate": 1.9448275862068963e-07,
      "loss": 0.5935,
      "step": 38
    },
    {
      "epoch": 0.22662889518413598,
      "grad_norm": 11.868419647216797,
      "learning_rate": 1.9402298850574712e-07,
      "loss": 0.6357,
      "step": 40
    },
    {
      "epoch": 0.23796033994334279,
      "grad_norm": 12.916513442993164,
      "learning_rate": 1.9356321839080458e-07,
      "loss": 0.6429,
      "step": 42
    },
    {
      "epoch": 0.24929178470254956,
      "grad_norm": 11.732407569885254,
      "learning_rate": 1.9310344827586208e-07,
      "loss": 0.6393,
      "step": 44
    },
    {
      "epoch": 0.26062322946175637,
      "grad_norm": 11.4376802444458,
      "learning_rate": 1.9264367816091954e-07,
      "loss": 0.5204,
      "step": 46
    },
    {
      "epoch": 0.2719546742209632,
      "grad_norm": 10.639026641845703,
      "learning_rate": 1.92183908045977e-07,
      "loss": 0.5111,
      "step": 48
    },
    {
      "epoch": 0.28328611898017,
      "grad_norm": 10.808749198913574,
      "learning_rate": 1.9172413793103447e-07,
      "loss": 0.5136,
      "step": 50
    },
    {
      "epoch": 0.28328611898017,
      "eval_loss": 0.25907036662101746,
      "eval_runtime": 58.3245,
      "eval_samples_per_second": 6.001,
      "eval_steps_per_second": 2.006,
      "step": 50
    },
    {
      "epoch": 0.29461756373937675,
      "grad_norm": 8.869841575622559,
      "learning_rate": 1.9126436781609194e-07,
      "loss": 0.4137,
      "step": 52
    },
    {
      "epoch": 0.3059490084985836,
      "grad_norm": 9.097506523132324,
      "learning_rate": 1.908045977011494e-07,
      "loss": 0.401,
      "step": 54
    },
    {
      "epoch": 0.31728045325779036,
      "grad_norm": 9.849846839904785,
      "learning_rate": 1.903448275862069e-07,
      "loss": 0.495,
      "step": 56
    },
    {
      "epoch": 0.3286118980169972,
      "grad_norm": 9.619014739990234,
      "learning_rate": 1.8988505747126436e-07,
      "loss": 0.4409,
      "step": 58
    },
    {
      "epoch": 0.33994334277620397,
      "grad_norm": 7.777515411376953,
      "learning_rate": 1.8942528735632185e-07,
      "loss": 0.4448,
      "step": 60
    },
    {
      "epoch": 0.35127478753541075,
      "grad_norm": 7.151308536529541,
      "learning_rate": 1.889655172413793e-07,
      "loss": 0.3374,
      "step": 62
    },
    {
      "epoch": 0.3626062322946176,
      "grad_norm": 8.01666259765625,
      "learning_rate": 1.8850574712643676e-07,
      "loss": 0.3785,
      "step": 64
    },
    {
      "epoch": 0.37393767705382436,
      "grad_norm": 8.04469108581543,
      "learning_rate": 1.8804597701149425e-07,
      "loss": 0.4004,
      "step": 66
    },
    {
      "epoch": 0.38526912181303113,
      "grad_norm": 8.3555326461792,
      "learning_rate": 1.8758620689655171e-07,
      "loss": 0.3751,
      "step": 68
    },
    {
      "epoch": 0.39660056657223797,
      "grad_norm": 6.18545389175415,
      "learning_rate": 1.8712643678160918e-07,
      "loss": 0.3077,
      "step": 70
    },
    {
      "epoch": 0.40793201133144474,
      "grad_norm": 7.299217700958252,
      "learning_rate": 1.8666666666666667e-07,
      "loss": 0.3527,
      "step": 72
    },
    {
      "epoch": 0.4192634560906516,
      "grad_norm": 8.593052864074707,
      "learning_rate": 1.8620689655172414e-07,
      "loss": 0.3186,
      "step": 74
    },
    {
      "epoch": 0.43059490084985835,
      "grad_norm": 7.345818996429443,
      "learning_rate": 1.857471264367816e-07,
      "loss": 0.2912,
      "step": 76
    },
    {
      "epoch": 0.44192634560906513,
      "grad_norm": 7.447642803192139,
      "learning_rate": 1.8528735632183907e-07,
      "loss": 0.3105,
      "step": 78
    },
    {
      "epoch": 0.45325779036827196,
      "grad_norm": 8.07083511352539,
      "learning_rate": 1.8482758620689653e-07,
      "loss": 0.2464,
      "step": 80
    },
    {
      "epoch": 0.46458923512747874,
      "grad_norm": 7.193418979644775,
      "learning_rate": 1.8436781609195402e-07,
      "loss": 0.2381,
      "step": 82
    },
    {
      "epoch": 0.47592067988668557,
      "grad_norm": 7.549230575561523,
      "learning_rate": 1.839080459770115e-07,
      "loss": 0.2598,
      "step": 84
    },
    {
      "epoch": 0.48725212464589235,
      "grad_norm": 7.774433135986328,
      "learning_rate": 1.8344827586206895e-07,
      "loss": 0.2261,
      "step": 86
    },
    {
      "epoch": 0.4985835694050991,
      "grad_norm": 7.496689319610596,
      "learning_rate": 1.8298850574712642e-07,
      "loss": 0.2159,
      "step": 88
    },
    {
      "epoch": 0.509915014164306,
      "grad_norm": 4.929551124572754,
      "learning_rate": 1.8252873563218388e-07,
      "loss": 0.1528,
      "step": 90
    },
    {
      "epoch": 0.5212464589235127,
      "grad_norm": 9.243413925170898,
      "learning_rate": 1.8206896551724138e-07,
      "loss": 0.176,
      "step": 92
    },
    {
      "epoch": 0.5325779036827195,
      "grad_norm": 7.296377182006836,
      "learning_rate": 1.8160919540229884e-07,
      "loss": 0.1588,
      "step": 94
    },
    {
      "epoch": 0.5439093484419264,
      "grad_norm": 9.553417205810547,
      "learning_rate": 1.811494252873563e-07,
      "loss": 0.1543,
      "step": 96
    },
    {
      "epoch": 0.5552407932011332,
      "grad_norm": 7.351907730102539,
      "learning_rate": 1.806896551724138e-07,
      "loss": 0.1433,
      "step": 98
    },
    {
      "epoch": 0.56657223796034,
      "grad_norm": 5.737069129943848,
      "learning_rate": 1.8022988505747126e-07,
      "loss": 0.1228,
      "step": 100
    },
    {
      "epoch": 0.56657223796034,
      "eval_loss": 0.06613212823867798,
      "eval_runtime": 58.5063,
      "eval_samples_per_second": 5.982,
      "eval_steps_per_second": 2.0,
      "step": 100
    },
    {
      "epoch": 0.5779036827195467,
      "grad_norm": 8.382906913757324,
      "learning_rate": 1.7977011494252873e-07,
      "loss": 0.1471,
      "step": 102
    },
    {
      "epoch": 0.5892351274787535,
      "grad_norm": 6.838889122009277,
      "learning_rate": 1.793103448275862e-07,
      "loss": 0.1035,
      "step": 104
    },
    {
      "epoch": 0.6005665722379604,
      "grad_norm": 6.563055515289307,
      "learning_rate": 1.7885057471264366e-07,
      "loss": 0.0872,
      "step": 106
    },
    {
      "epoch": 0.6118980169971672,
      "grad_norm": 7.472197532653809,
      "learning_rate": 1.7839080459770115e-07,
      "loss": 0.1129,
      "step": 108
    },
    {
      "epoch": 0.623229461756374,
      "grad_norm": 7.348904132843018,
      "learning_rate": 1.7793103448275862e-07,
      "loss": 0.1099,
      "step": 110
    },
    {
      "epoch": 0.6345609065155807,
      "grad_norm": 8.309453010559082,
      "learning_rate": 1.7747126436781608e-07,
      "loss": 0.112,
      "step": 112
    },
    {
      "epoch": 0.6458923512747875,
      "grad_norm": 7.677189826965332,
      "learning_rate": 1.7701149425287355e-07,
      "loss": 0.1016,
      "step": 114
    },
    {
      "epoch": 0.6572237960339944,
      "grad_norm": 6.976114273071289,
      "learning_rate": 1.7655172413793101e-07,
      "loss": 0.0862,
      "step": 116
    },
    {
      "epoch": 0.6685552407932012,
      "grad_norm": 7.3442487716674805,
      "learning_rate": 1.7609195402298848e-07,
      "loss": 0.1174,
      "step": 118
    },
    {
      "epoch": 0.6798866855524079,
      "grad_norm": 6.72037935256958,
      "learning_rate": 1.7563218390804597e-07,
      "loss": 0.0906,
      "step": 120
    },
    {
      "epoch": 0.6912181303116147,
      "grad_norm": 7.490478515625,
      "learning_rate": 1.7517241379310344e-07,
      "loss": 0.1107,
      "step": 122
    },
    {
      "epoch": 0.7025495750708215,
      "grad_norm": 8.412711143493652,
      "learning_rate": 1.7471264367816093e-07,
      "loss": 0.1262,
      "step": 124
    },
    {
      "epoch": 0.7138810198300283,
      "grad_norm": 6.162937641143799,
      "learning_rate": 1.742528735632184e-07,
      "loss": 0.0735,
      "step": 126
    },
    {
      "epoch": 0.7252124645892352,
      "grad_norm": 7.0229363441467285,
      "learning_rate": 1.7379310344827586e-07,
      "loss": 0.0767,
      "step": 128
    },
    {
      "epoch": 0.7365439093484419,
      "grad_norm": 7.203760147094727,
      "learning_rate": 1.7333333333333332e-07,
      "loss": 0.0838,
      "step": 130
    },
    {
      "epoch": 0.7478753541076487,
      "grad_norm": 8.376720428466797,
      "learning_rate": 1.728735632183908e-07,
      "loss": 0.0873,
      "step": 132
    },
    {
      "epoch": 0.7592067988668555,
      "grad_norm": 7.050066947937012,
      "learning_rate": 1.7241379310344825e-07,
      "loss": 0.0792,
      "step": 134
    },
    {
      "epoch": 0.7705382436260623,
      "grad_norm": 8.225197792053223,
      "learning_rate": 1.7195402298850575e-07,
      "loss": 0.0975,
      "step": 136
    },
    {
      "epoch": 0.7818696883852692,
      "grad_norm": 7.450314521789551,
      "learning_rate": 1.714942528735632e-07,
      "loss": 0.1,
      "step": 138
    },
    {
      "epoch": 0.7932011331444759,
      "grad_norm": 7.37244987487793,
      "learning_rate": 1.710344827586207e-07,
      "loss": 0.0754,
      "step": 140
    },
    {
      "epoch": 0.8045325779036827,
      "grad_norm": 7.468136787414551,
      "learning_rate": 1.7057471264367814e-07,
      "loss": 0.0794,
      "step": 142
    },
    {
      "epoch": 0.8158640226628895,
      "grad_norm": 8.50837516784668,
      "learning_rate": 1.701149425287356e-07,
      "loss": 0.0868,
      "step": 144
    },
    {
      "epoch": 0.8271954674220963,
      "grad_norm": 8.161556243896484,
      "learning_rate": 1.696551724137931e-07,
      "loss": 0.103,
      "step": 146
    },
    {
      "epoch": 0.8385269121813032,
      "grad_norm": 7.930548667907715,
      "learning_rate": 1.6919540229885056e-07,
      "loss": 0.0788,
      "step": 148
    },
    {
      "epoch": 0.8498583569405099,
      "grad_norm": 7.564940929412842,
      "learning_rate": 1.6873563218390803e-07,
      "loss": 0.0846,
      "step": 150
    },
    {
      "epoch": 0.8498583569405099,
      "eval_loss": 0.043400950729846954,
      "eval_runtime": 58.5966,
      "eval_samples_per_second": 5.973,
      "eval_steps_per_second": 1.997,
      "step": 150
    },
    {
      "epoch": 0.8611898016997167,
      "grad_norm": 7.810537338256836,
      "learning_rate": 1.6827586206896552e-07,
      "loss": 0.0861,
      "step": 152
    },
    {
      "epoch": 0.8725212464589235,
      "grad_norm": 6.980551242828369,
      "learning_rate": 1.67816091954023e-07,
      "loss": 0.0659,
      "step": 154
    },
    {
      "epoch": 0.8838526912181303,
      "grad_norm": 7.5854597091674805,
      "learning_rate": 1.6735632183908045e-07,
      "loss": 0.0655,
      "step": 156
    },
    {
      "epoch": 0.8951841359773371,
      "grad_norm": 7.803754806518555,
      "learning_rate": 1.6689655172413792e-07,
      "loss": 0.0756,
      "step": 158
    },
    {
      "epoch": 0.9065155807365439,
      "grad_norm": 6.776433944702148,
      "learning_rate": 1.6643678160919538e-07,
      "loss": 0.0661,
      "step": 160
    },
    {
      "epoch": 0.9178470254957507,
      "grad_norm": 7.673105239868164,
      "learning_rate": 1.6597701149425287e-07,
      "loss": 0.105,
      "step": 162
    },
    {
      "epoch": 0.9291784702549575,
      "grad_norm": 7.120344161987305,
      "learning_rate": 1.6551724137931034e-07,
      "loss": 0.0797,
      "step": 164
    },
    {
      "epoch": 0.9405099150141643,
      "grad_norm": 8.53663158416748,
      "learning_rate": 1.650574712643678e-07,
      "loss": 0.0942,
      "step": 166
    },
    {
      "epoch": 0.9518413597733711,
      "grad_norm": 9.79232406616211,
      "learning_rate": 1.6459770114942527e-07,
      "loss": 0.0885,
      "step": 168
    },
    {
      "epoch": 0.9631728045325779,
      "grad_norm": 7.993936061859131,
      "learning_rate": 1.6413793103448274e-07,
      "loss": 0.0696,
      "step": 170
    },
    {
      "epoch": 0.9745042492917847,
      "grad_norm": 7.143523216247559,
      "learning_rate": 1.6367816091954023e-07,
      "loss": 0.0735,
      "step": 172
    },
    {
      "epoch": 0.9858356940509915,
      "grad_norm": 7.495512008666992,
      "learning_rate": 1.632183908045977e-07,
      "loss": 0.0838,
      "step": 174
    },
    {
      "epoch": 0.9971671388101983,
      "grad_norm": 7.799754619598389,
      "learning_rate": 1.6275862068965516e-07,
      "loss": 0.082,
      "step": 176
    },
    {
      "epoch": 1.0056657223796035,
      "grad_norm": 7.807611465454102,
      "learning_rate": 1.6229885057471265e-07,
      "loss": 0.0548,
      "step": 178
    },
    {
      "epoch": 1.0169971671388103,
      "grad_norm": 6.675009250640869,
      "learning_rate": 1.6183908045977012e-07,
      "loss": 0.0841,
      "step": 180
    },
    {
      "epoch": 1.028328611898017,
      "grad_norm": 6.997397422790527,
      "learning_rate": 1.6137931034482758e-07,
      "loss": 0.0602,
      "step": 182
    },
    {
      "epoch": 1.0396600566572238,
      "grad_norm": 7.82025146484375,
      "learning_rate": 1.6091954022988505e-07,
      "loss": 0.0745,
      "step": 184
    },
    {
      "epoch": 1.0509915014164306,
      "grad_norm": 7.6183180809021,
      "learning_rate": 1.604597701149425e-07,
      "loss": 0.0813,
      "step": 186
    },
    {
      "epoch": 1.0623229461756374,
      "grad_norm": 7.332201957702637,
      "learning_rate": 1.6e-07,
      "loss": 0.0693,
      "step": 188
    },
    {
      "epoch": 1.0736543909348442,
      "grad_norm": 6.992514133453369,
      "learning_rate": 1.5954022988505747e-07,
      "loss": 0.0637,
      "step": 190
    },
    {
      "epoch": 1.084985835694051,
      "grad_norm": 7.306890964508057,
      "learning_rate": 1.5908045977011493e-07,
      "loss": 0.0695,
      "step": 192
    },
    {
      "epoch": 1.0963172804532577,
      "grad_norm": 7.079704284667969,
      "learning_rate": 1.5862068965517243e-07,
      "loss": 0.0646,
      "step": 194
    },
    {
      "epoch": 1.1076487252124645,
      "grad_norm": 7.926516532897949,
      "learning_rate": 1.5816091954022986e-07,
      "loss": 0.0676,
      "step": 196
    },
    {
      "epoch": 1.1189801699716715,
      "grad_norm": 8.795439720153809,
      "learning_rate": 1.5770114942528733e-07,
      "loss": 0.0933,
      "step": 198
    },
    {
      "epoch": 1.1303116147308783,
      "grad_norm": 7.790001392364502,
      "learning_rate": 1.5724137931034482e-07,
      "loss": 0.0677,
      "step": 200
    },
    {
      "epoch": 1.1303116147308783,
      "eval_loss": 0.03847797214984894,
      "eval_runtime": 58.5674,
      "eval_samples_per_second": 5.976,
      "eval_steps_per_second": 1.998,
      "step": 200
    },
    {
      "epoch": 1.141643059490085,
      "grad_norm": 7.721756458282471,
      "learning_rate": 1.567816091954023e-07,
      "loss": 0.0791,
      "step": 202
    },
    {
      "epoch": 1.1529745042492918,
      "grad_norm": 7.5822343826293945,
      "learning_rate": 1.5632183908045978e-07,
      "loss": 0.0766,
      "step": 204
    },
    {
      "epoch": 1.1643059490084986,
      "grad_norm": 6.944662570953369,
      "learning_rate": 1.5586206896551724e-07,
      "loss": 0.0695,
      "step": 206
    },
    {
      "epoch": 1.1756373937677054,
      "grad_norm": 7.224205493927002,
      "learning_rate": 1.554022988505747e-07,
      "loss": 0.0754,
      "step": 208
    },
    {
      "epoch": 1.1869688385269122,
      "grad_norm": 8.137146949768066,
      "learning_rate": 1.5494252873563217e-07,
      "loss": 0.0795,
      "step": 210
    },
    {
      "epoch": 1.198300283286119,
      "grad_norm": 6.717535018920898,
      "learning_rate": 1.5448275862068964e-07,
      "loss": 0.0591,
      "step": 212
    },
    {
      "epoch": 1.2096317280453257,
      "grad_norm": 7.468085765838623,
      "learning_rate": 1.540229885057471e-07,
      "loss": 0.0556,
      "step": 214
    },
    {
      "epoch": 1.2209631728045325,
      "grad_norm": 7.386537551879883,
      "learning_rate": 1.535632183908046e-07,
      "loss": 0.0635,
      "step": 216
    },
    {
      "epoch": 1.2322946175637393,
      "grad_norm": 6.413255214691162,
      "learning_rate": 1.5310344827586206e-07,
      "loss": 0.0586,
      "step": 218
    },
    {
      "epoch": 1.2436260623229463,
      "grad_norm": 6.197726726531982,
      "learning_rate": 1.5264367816091955e-07,
      "loss": 0.0593,
      "step": 220
    },
    {
      "epoch": 1.254957507082153,
      "grad_norm": 6.984250068664551,
      "learning_rate": 1.52183908045977e-07,
      "loss": 0.0687,
      "step": 222
    },
    {
      "epoch": 1.2662889518413598,
      "grad_norm": 6.338926315307617,
      "learning_rate": 1.5172413793103446e-07,
      "loss": 0.0648,
      "step": 224
    },
    {
      "epoch": 1.2776203966005666,
      "grad_norm": 6.7211384773254395,
      "learning_rate": 1.5126436781609195e-07,
      "loss": 0.074,
      "step": 226
    },
    {
      "epoch": 1.2889518413597734,
      "grad_norm": 7.576125621795654,
      "learning_rate": 1.5080459770114942e-07,
      "loss": 0.0701,
      "step": 228
    },
    {
      "epoch": 1.3002832861189801,
      "grad_norm": 7.268312454223633,
      "learning_rate": 1.5034482758620688e-07,
      "loss": 0.069,
      "step": 230
    },
    {
      "epoch": 1.311614730878187,
      "grad_norm": 7.681404113769531,
      "learning_rate": 1.4988505747126437e-07,
      "loss": 0.0644,
      "step": 232
    },
    {
      "epoch": 1.3229461756373937,
      "grad_norm": 7.724630832672119,
      "learning_rate": 1.4942528735632184e-07,
      "loss": 0.0671,
      "step": 234
    },
    {
      "epoch": 1.3342776203966005,
      "grad_norm": 7.316229820251465,
      "learning_rate": 1.489655172413793e-07,
      "loss": 0.059,
      "step": 236
    },
    {
      "epoch": 1.3456090651558075,
      "grad_norm": 6.948664665222168,
      "learning_rate": 1.4850574712643677e-07,
      "loss": 0.0628,
      "step": 238
    },
    {
      "epoch": 1.356940509915014,
      "grad_norm": 7.636634349822998,
      "learning_rate": 1.4804597701149423e-07,
      "loss": 0.0753,
      "step": 240
    },
    {
      "epoch": 1.368271954674221,
      "grad_norm": 8.362703323364258,
      "learning_rate": 1.4758620689655173e-07,
      "loss": 0.0819,
      "step": 242
    },
    {
      "epoch": 1.3796033994334278,
      "grad_norm": 7.090531826019287,
      "learning_rate": 1.471264367816092e-07,
      "loss": 0.0663,
      "step": 244
    },
    {
      "epoch": 1.3909348441926346,
      "grad_norm": 7.547560214996338,
      "learning_rate": 1.4666666666666666e-07,
      "loss": 0.0635,
      "step": 246
    },
    {
      "epoch": 1.4022662889518414,
      "grad_norm": 7.471890449523926,
      "learning_rate": 1.4620689655172412e-07,
      "loss": 0.0517,
      "step": 248
    },
    {
      "epoch": 1.4135977337110481,
      "grad_norm": 6.707093238830566,
      "learning_rate": 1.457471264367816e-07,
      "loss": 0.075,
      "step": 250
    },
    {
      "epoch": 1.4135977337110481,
      "eval_loss": 0.035781800746917725,
      "eval_runtime": 58.5812,
      "eval_samples_per_second": 5.975,
      "eval_steps_per_second": 1.997,
      "step": 250
    },
    {
      "epoch": 1.424929178470255,
      "grad_norm": 7.443450450897217,
      "learning_rate": 1.4528735632183908e-07,
      "loss": 0.0661,
      "step": 252
    },
    {
      "epoch": 1.4362606232294617,
      "grad_norm": 7.724794387817383,
      "learning_rate": 1.4482758620689654e-07,
      "loss": 0.0651,
      "step": 254
    },
    {
      "epoch": 1.4475920679886687,
      "grad_norm": 6.492702007293701,
      "learning_rate": 1.44367816091954e-07,
      "loss": 0.0481,
      "step": 256
    },
    {
      "epoch": 1.4589235127478752,
      "grad_norm": 6.878933906555176,
      "learning_rate": 1.439080459770115e-07,
      "loss": 0.064,
      "step": 258
    },
    {
      "epoch": 1.4702549575070822,
      "grad_norm": 7.361611366271973,
      "learning_rate": 1.4344827586206897e-07,
      "loss": 0.0644,
      "step": 260
    },
    {
      "epoch": 1.481586402266289,
      "grad_norm": 6.968555927276611,
      "learning_rate": 1.4298850574712643e-07,
      "loss": 0.0609,
      "step": 262
    },
    {
      "epoch": 1.4929178470254958,
      "grad_norm": 7.533900260925293,
      "learning_rate": 1.425287356321839e-07,
      "loss": 0.0607,
      "step": 264
    },
    {
      "epoch": 1.5042492917847026,
      "grad_norm": 7.612795829772949,
      "learning_rate": 1.4206896551724136e-07,
      "loss": 0.0617,
      "step": 266
    },
    {
      "epoch": 1.5155807365439093,
      "grad_norm": 7.662258148193359,
      "learning_rate": 1.4160919540229885e-07,
      "loss": 0.0737,
      "step": 268
    },
    {
      "epoch": 1.5269121813031161,
      "grad_norm": 7.1722893714904785,
      "learning_rate": 1.4114942528735632e-07,
      "loss": 0.0615,
      "step": 270
    },
    {
      "epoch": 1.538243626062323,
      "grad_norm": 7.579257011413574,
      "learning_rate": 1.4068965517241379e-07,
      "loss": 0.0647,
      "step": 272
    },
    {
      "epoch": 1.54957507082153,
      "grad_norm": 7.028329372406006,
      "learning_rate": 1.4022988505747128e-07,
      "loss": 0.0698,
      "step": 274
    },
    {
      "epoch": 1.5609065155807365,
      "grad_norm": 7.981624126434326,
      "learning_rate": 1.3977011494252872e-07,
      "loss": 0.0704,
      "step": 276
    },
    {
      "epoch": 1.5722379603399435,
      "grad_norm": 7.229209899902344,
      "learning_rate": 1.3931034482758618e-07,
      "loss": 0.0662,
      "step": 278
    },
    {
      "epoch": 1.58356940509915,
      "grad_norm": 8.259303092956543,
      "learning_rate": 1.3885057471264367e-07,
      "loss": 0.0622,
      "step": 280
    },
    {
      "epoch": 1.594900849858357,
      "grad_norm": 8.45683765411377,
      "learning_rate": 1.3839080459770114e-07,
      "loss": 0.0846,
      "step": 282
    },
    {
      "epoch": 1.6062322946175638,
      "grad_norm": 8.067914009094238,
      "learning_rate": 1.3793103448275863e-07,
      "loss": 0.0743,
      "step": 284
    },
    {
      "epoch": 1.6175637393767706,
      "grad_norm": 7.243266582489014,
      "learning_rate": 1.374712643678161e-07,
      "loss": 0.0646,
      "step": 286
    },
    {
      "epoch": 1.6288951841359773,
      "grad_norm": 7.551538944244385,
      "learning_rate": 1.3701149425287356e-07,
      "loss": 0.0682,
      "step": 288
    },
    {
      "epoch": 1.6402266288951841,
      "grad_norm": 8.334272384643555,
      "learning_rate": 1.3655172413793103e-07,
      "loss": 0.0614,
      "step": 290
    },
    {
      "epoch": 1.651558073654391,
      "grad_norm": 7.791774749755859,
      "learning_rate": 1.360919540229885e-07,
      "loss": 0.0558,
      "step": 292
    },
    {
      "epoch": 1.6628895184135977,
      "grad_norm": 8.262134552001953,
      "learning_rate": 1.3563218390804596e-07,
      "loss": 0.0575,
      "step": 294
    },
    {
      "epoch": 1.6742209631728047,
      "grad_norm": 7.904037952423096,
      "learning_rate": 1.3517241379310345e-07,
      "loss": 0.079,
      "step": 296
    },
    {
      "epoch": 1.6855524079320112,
      "grad_norm": 7.991185188293457,
      "learning_rate": 1.3471264367816091e-07,
      "loss": 0.0616,
      "step": 298
    },
    {
      "epoch": 1.6968838526912182,
      "grad_norm": 7.316665172576904,
      "learning_rate": 1.342528735632184e-07,
      "loss": 0.0575,
      "step": 300
    },
    {
      "epoch": 1.6968838526912182,
      "eval_loss": 0.0336877666413784,
      "eval_runtime": 58.6227,
      "eval_samples_per_second": 5.97,
      "eval_steps_per_second": 1.996,
      "step": 300
    }
  ],
  "logging_steps": 2,
  "max_steps": 880,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.292928973307904e+16,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}