{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.6968838526912182,
"eval_steps": 50,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0113314447592068,
"grad_norm": null,
"learning_rate": 0.0,
"loss": 3.3809,
"step": 2
},
{
"epoch": 0.0226628895184136,
"grad_norm": 744.6710815429688,
"learning_rate": 0.0,
"loss": 3.2718,
"step": 4
},
{
"epoch": 0.0339943342776204,
"grad_norm": 575.2051391601562,
"learning_rate": 4e-08,
"loss": 3.0156,
"step": 6
},
{
"epoch": 0.0453257790368272,
"grad_norm": 632.8484497070312,
"learning_rate": 8e-08,
"loss": 3.6857,
"step": 8
},
{
"epoch": 0.056657223796033995,
"grad_norm": 615.0200805664062,
"learning_rate": 1.2e-07,
"loss": 3.2988,
"step": 10
},
{
"epoch": 0.0679886685552408,
"grad_norm": 807.7252197265625,
"learning_rate": 1.6e-07,
"loss": 3.798,
"step": 12
},
{
"epoch": 0.07932011331444759,
"grad_norm": 661.0059814453125,
"learning_rate": 2e-07,
"loss": 2.7579,
"step": 14
},
{
"epoch": 0.0906515580736544,
"grad_norm": 391.60333251953125,
"learning_rate": 1.9954022988505746e-07,
"loss": 1.749,
"step": 16
},
{
"epoch": 0.10198300283286119,
"grad_norm": 350.3952331542969,
"learning_rate": 1.9908045977011495e-07,
"loss": 1.5168,
"step": 18
},
{
"epoch": 0.11331444759206799,
"grad_norm": 23.803192138671875,
"learning_rate": 1.9862068965517241e-07,
"loss": 1.062,
"step": 20
},
{
"epoch": 0.12464589235127478,
"grad_norm": 26.993032455444336,
"learning_rate": 1.9816091954022985e-07,
"loss": 0.8715,
"step": 22
},
{
"epoch": 0.1359773371104816,
"grad_norm": 20.856576919555664,
"learning_rate": 1.9770114942528734e-07,
"loss": 0.8205,
"step": 24
},
{
"epoch": 0.14730878186968838,
"grad_norm": 17.476659774780273,
"learning_rate": 1.972413793103448e-07,
"loss": 0.8846,
"step": 26
},
{
"epoch": 0.15864022662889518,
"grad_norm": 17.18401336669922,
"learning_rate": 1.967816091954023e-07,
"loss": 0.8203,
"step": 28
},
{
"epoch": 0.16997167138810199,
"grad_norm": 16.565576553344727,
"learning_rate": 1.9632183908045977e-07,
"loss": 0.8568,
"step": 30
},
{
"epoch": 0.1813031161473088,
"grad_norm": 11.906989097595215,
"learning_rate": 1.9586206896551723e-07,
"loss": 0.6635,
"step": 32
},
{
"epoch": 0.19263456090651557,
"grad_norm": 12.794220924377441,
"learning_rate": 1.954022988505747e-07,
"loss": 0.7228,
"step": 34
},
{
"epoch": 0.20396600566572237,
"grad_norm": 12.355205535888672,
"learning_rate": 1.9494252873563216e-07,
"loss": 0.6619,
"step": 36
},
{
"epoch": 0.21529745042492918,
"grad_norm": 12.685115814208984,
"learning_rate": 1.9448275862068963e-07,
"loss": 0.5935,
"step": 38
},
{
"epoch": 0.22662889518413598,
"grad_norm": 11.868419647216797,
"learning_rate": 1.9402298850574712e-07,
"loss": 0.6357,
"step": 40
},
{
"epoch": 0.23796033994334279,
"grad_norm": 12.916513442993164,
"learning_rate": 1.9356321839080458e-07,
"loss": 0.6429,
"step": 42
},
{
"epoch": 0.24929178470254956,
"grad_norm": 11.732407569885254,
"learning_rate": 1.9310344827586208e-07,
"loss": 0.6393,
"step": 44
},
{
"epoch": 0.26062322946175637,
"grad_norm": 11.4376802444458,
"learning_rate": 1.9264367816091954e-07,
"loss": 0.5204,
"step": 46
},
{
"epoch": 0.2719546742209632,
"grad_norm": 10.639026641845703,
"learning_rate": 1.92183908045977e-07,
"loss": 0.5111,
"step": 48
},
{
"epoch": 0.28328611898017,
"grad_norm": 10.808749198913574,
"learning_rate": 1.9172413793103447e-07,
"loss": 0.5136,
"step": 50
},
{
"epoch": 0.28328611898017,
"eval_loss": 0.25907036662101746,
"eval_runtime": 58.3245,
"eval_samples_per_second": 6.001,
"eval_steps_per_second": 2.006,
"step": 50
},
{
"epoch": 0.29461756373937675,
"grad_norm": 8.869841575622559,
"learning_rate": 1.9126436781609194e-07,
"loss": 0.4137,
"step": 52
},
{
"epoch": 0.3059490084985836,
"grad_norm": 9.097506523132324,
"learning_rate": 1.908045977011494e-07,
"loss": 0.401,
"step": 54
},
{
"epoch": 0.31728045325779036,
"grad_norm": 9.849846839904785,
"learning_rate": 1.903448275862069e-07,
"loss": 0.495,
"step": 56
},
{
"epoch": 0.3286118980169972,
"grad_norm": 9.619014739990234,
"learning_rate": 1.8988505747126436e-07,
"loss": 0.4409,
"step": 58
},
{
"epoch": 0.33994334277620397,
"grad_norm": 7.777515411376953,
"learning_rate": 1.8942528735632185e-07,
"loss": 0.4448,
"step": 60
},
{
"epoch": 0.35127478753541075,
"grad_norm": 7.151308536529541,
"learning_rate": 1.889655172413793e-07,
"loss": 0.3374,
"step": 62
},
{
"epoch": 0.3626062322946176,
"grad_norm": 8.01666259765625,
"learning_rate": 1.8850574712643676e-07,
"loss": 0.3785,
"step": 64
},
{
"epoch": 0.37393767705382436,
"grad_norm": 8.04469108581543,
"learning_rate": 1.8804597701149425e-07,
"loss": 0.4004,
"step": 66
},
{
"epoch": 0.38526912181303113,
"grad_norm": 8.3555326461792,
"learning_rate": 1.8758620689655171e-07,
"loss": 0.3751,
"step": 68
},
{
"epoch": 0.39660056657223797,
"grad_norm": 6.18545389175415,
"learning_rate": 1.8712643678160918e-07,
"loss": 0.3077,
"step": 70
},
{
"epoch": 0.40793201133144474,
"grad_norm": 7.299217700958252,
"learning_rate": 1.8666666666666667e-07,
"loss": 0.3527,
"step": 72
},
{
"epoch": 0.4192634560906516,
"grad_norm": 8.593052864074707,
"learning_rate": 1.8620689655172414e-07,
"loss": 0.3186,
"step": 74
},
{
"epoch": 0.43059490084985835,
"grad_norm": 7.345818996429443,
"learning_rate": 1.857471264367816e-07,
"loss": 0.2912,
"step": 76
},
{
"epoch": 0.44192634560906513,
"grad_norm": 7.447642803192139,
"learning_rate": 1.8528735632183907e-07,
"loss": 0.3105,
"step": 78
},
{
"epoch": 0.45325779036827196,
"grad_norm": 8.07083511352539,
"learning_rate": 1.8482758620689653e-07,
"loss": 0.2464,
"step": 80
},
{
"epoch": 0.46458923512747874,
"grad_norm": 7.193418979644775,
"learning_rate": 1.8436781609195402e-07,
"loss": 0.2381,
"step": 82
},
{
"epoch": 0.47592067988668557,
"grad_norm": 7.549230575561523,
"learning_rate": 1.839080459770115e-07,
"loss": 0.2598,
"step": 84
},
{
"epoch": 0.48725212464589235,
"grad_norm": 7.774433135986328,
"learning_rate": 1.8344827586206895e-07,
"loss": 0.2261,
"step": 86
},
{
"epoch": 0.4985835694050991,
"grad_norm": 7.496689319610596,
"learning_rate": 1.8298850574712642e-07,
"loss": 0.2159,
"step": 88
},
{
"epoch": 0.509915014164306,
"grad_norm": 4.929551124572754,
"learning_rate": 1.8252873563218388e-07,
"loss": 0.1528,
"step": 90
},
{
"epoch": 0.5212464589235127,
"grad_norm": 9.243413925170898,
"learning_rate": 1.8206896551724138e-07,
"loss": 0.176,
"step": 92
},
{
"epoch": 0.5325779036827195,
"grad_norm": 7.296377182006836,
"learning_rate": 1.8160919540229884e-07,
"loss": 0.1588,
"step": 94
},
{
"epoch": 0.5439093484419264,
"grad_norm": 9.553417205810547,
"learning_rate": 1.811494252873563e-07,
"loss": 0.1543,
"step": 96
},
{
"epoch": 0.5552407932011332,
"grad_norm": 7.351907730102539,
"learning_rate": 1.806896551724138e-07,
"loss": 0.1433,
"step": 98
},
{
"epoch": 0.56657223796034,
"grad_norm": 5.737069129943848,
"learning_rate": 1.8022988505747126e-07,
"loss": 0.1228,
"step": 100
},
{
"epoch": 0.56657223796034,
"eval_loss": 0.06613212823867798,
"eval_runtime": 58.5063,
"eval_samples_per_second": 5.982,
"eval_steps_per_second": 2.0,
"step": 100
},
{
"epoch": 0.5779036827195467,
"grad_norm": 8.382906913757324,
"learning_rate": 1.7977011494252873e-07,
"loss": 0.1471,
"step": 102
},
{
"epoch": 0.5892351274787535,
"grad_norm": 6.838889122009277,
"learning_rate": 1.793103448275862e-07,
"loss": 0.1035,
"step": 104
},
{
"epoch": 0.6005665722379604,
"grad_norm": 6.563055515289307,
"learning_rate": 1.7885057471264366e-07,
"loss": 0.0872,
"step": 106
},
{
"epoch": 0.6118980169971672,
"grad_norm": 7.472197532653809,
"learning_rate": 1.7839080459770115e-07,
"loss": 0.1129,
"step": 108
},
{
"epoch": 0.623229461756374,
"grad_norm": 7.348904132843018,
"learning_rate": 1.7793103448275862e-07,
"loss": 0.1099,
"step": 110
},
{
"epoch": 0.6345609065155807,
"grad_norm": 8.309453010559082,
"learning_rate": 1.7747126436781608e-07,
"loss": 0.112,
"step": 112
},
{
"epoch": 0.6458923512747875,
"grad_norm": 7.677189826965332,
"learning_rate": 1.7701149425287355e-07,
"loss": 0.1016,
"step": 114
},
{
"epoch": 0.6572237960339944,
"grad_norm": 6.976114273071289,
"learning_rate": 1.7655172413793101e-07,
"loss": 0.0862,
"step": 116
},
{
"epoch": 0.6685552407932012,
"grad_norm": 7.3442487716674805,
"learning_rate": 1.7609195402298848e-07,
"loss": 0.1174,
"step": 118
},
{
"epoch": 0.6798866855524079,
"grad_norm": 6.72037935256958,
"learning_rate": 1.7563218390804597e-07,
"loss": 0.0906,
"step": 120
},
{
"epoch": 0.6912181303116147,
"grad_norm": 7.490478515625,
"learning_rate": 1.7517241379310344e-07,
"loss": 0.1107,
"step": 122
},
{
"epoch": 0.7025495750708215,
"grad_norm": 8.412711143493652,
"learning_rate": 1.7471264367816093e-07,
"loss": 0.1262,
"step": 124
},
{
"epoch": 0.7138810198300283,
"grad_norm": 6.162937641143799,
"learning_rate": 1.742528735632184e-07,
"loss": 0.0735,
"step": 126
},
{
"epoch": 0.7252124645892352,
"grad_norm": 7.0229363441467285,
"learning_rate": 1.7379310344827586e-07,
"loss": 0.0767,
"step": 128
},
{
"epoch": 0.7365439093484419,
"grad_norm": 7.203760147094727,
"learning_rate": 1.7333333333333332e-07,
"loss": 0.0838,
"step": 130
},
{
"epoch": 0.7478753541076487,
"grad_norm": 8.376720428466797,
"learning_rate": 1.728735632183908e-07,
"loss": 0.0873,
"step": 132
},
{
"epoch": 0.7592067988668555,
"grad_norm": 7.050066947937012,
"learning_rate": 1.7241379310344825e-07,
"loss": 0.0792,
"step": 134
},
{
"epoch": 0.7705382436260623,
"grad_norm": 8.225197792053223,
"learning_rate": 1.7195402298850575e-07,
"loss": 0.0975,
"step": 136
},
{
"epoch": 0.7818696883852692,
"grad_norm": 7.450314521789551,
"learning_rate": 1.714942528735632e-07,
"loss": 0.1,
"step": 138
},
{
"epoch": 0.7932011331444759,
"grad_norm": 7.37244987487793,
"learning_rate": 1.710344827586207e-07,
"loss": 0.0754,
"step": 140
},
{
"epoch": 0.8045325779036827,
"grad_norm": 7.468136787414551,
"learning_rate": 1.7057471264367814e-07,
"loss": 0.0794,
"step": 142
},
{
"epoch": 0.8158640226628895,
"grad_norm": 8.50837516784668,
"learning_rate": 1.701149425287356e-07,
"loss": 0.0868,
"step": 144
},
{
"epoch": 0.8271954674220963,
"grad_norm": 8.161556243896484,
"learning_rate": 1.696551724137931e-07,
"loss": 0.103,
"step": 146
},
{
"epoch": 0.8385269121813032,
"grad_norm": 7.930548667907715,
"learning_rate": 1.6919540229885056e-07,
"loss": 0.0788,
"step": 148
},
{
"epoch": 0.8498583569405099,
"grad_norm": 7.564940929412842,
"learning_rate": 1.6873563218390803e-07,
"loss": 0.0846,
"step": 150
},
{
"epoch": 0.8498583569405099,
"eval_loss": 0.043400950729846954,
"eval_runtime": 58.5966,
"eval_samples_per_second": 5.973,
"eval_steps_per_second": 1.997,
"step": 150
},
{
"epoch": 0.8611898016997167,
"grad_norm": 7.810537338256836,
"learning_rate": 1.6827586206896552e-07,
"loss": 0.0861,
"step": 152
},
{
"epoch": 0.8725212464589235,
"grad_norm": 6.980551242828369,
"learning_rate": 1.67816091954023e-07,
"loss": 0.0659,
"step": 154
},
{
"epoch": 0.8838526912181303,
"grad_norm": 7.5854597091674805,
"learning_rate": 1.6735632183908045e-07,
"loss": 0.0655,
"step": 156
},
{
"epoch": 0.8951841359773371,
"grad_norm": 7.803754806518555,
"learning_rate": 1.6689655172413792e-07,
"loss": 0.0756,
"step": 158
},
{
"epoch": 0.9065155807365439,
"grad_norm": 6.776433944702148,
"learning_rate": 1.6643678160919538e-07,
"loss": 0.0661,
"step": 160
},
{
"epoch": 0.9178470254957507,
"grad_norm": 7.673105239868164,
"learning_rate": 1.6597701149425287e-07,
"loss": 0.105,
"step": 162
},
{
"epoch": 0.9291784702549575,
"grad_norm": 7.120344161987305,
"learning_rate": 1.6551724137931034e-07,
"loss": 0.0797,
"step": 164
},
{
"epoch": 0.9405099150141643,
"grad_norm": 8.53663158416748,
"learning_rate": 1.650574712643678e-07,
"loss": 0.0942,
"step": 166
},
{
"epoch": 0.9518413597733711,
"grad_norm": 9.79232406616211,
"learning_rate": 1.6459770114942527e-07,
"loss": 0.0885,
"step": 168
},
{
"epoch": 0.9631728045325779,
"grad_norm": 7.993936061859131,
"learning_rate": 1.6413793103448274e-07,
"loss": 0.0696,
"step": 170
},
{
"epoch": 0.9745042492917847,
"grad_norm": 7.143523216247559,
"learning_rate": 1.6367816091954023e-07,
"loss": 0.0735,
"step": 172
},
{
"epoch": 0.9858356940509915,
"grad_norm": 7.495512008666992,
"learning_rate": 1.632183908045977e-07,
"loss": 0.0838,
"step": 174
},
{
"epoch": 0.9971671388101983,
"grad_norm": 7.799754619598389,
"learning_rate": 1.6275862068965516e-07,
"loss": 0.082,
"step": 176
},
{
"epoch": 1.0056657223796035,
"grad_norm": 7.807611465454102,
"learning_rate": 1.6229885057471265e-07,
"loss": 0.0548,
"step": 178
},
{
"epoch": 1.0169971671388103,
"grad_norm": 6.675009250640869,
"learning_rate": 1.6183908045977012e-07,
"loss": 0.0841,
"step": 180
},
{
"epoch": 1.028328611898017,
"grad_norm": 6.997397422790527,
"learning_rate": 1.6137931034482758e-07,
"loss": 0.0602,
"step": 182
},
{
"epoch": 1.0396600566572238,
"grad_norm": 7.82025146484375,
"learning_rate": 1.6091954022988505e-07,
"loss": 0.0745,
"step": 184
},
{
"epoch": 1.0509915014164306,
"grad_norm": 7.6183180809021,
"learning_rate": 1.604597701149425e-07,
"loss": 0.0813,
"step": 186
},
{
"epoch": 1.0623229461756374,
"grad_norm": 7.332201957702637,
"learning_rate": 1.6e-07,
"loss": 0.0693,
"step": 188
},
{
"epoch": 1.0736543909348442,
"grad_norm": 6.992514133453369,
"learning_rate": 1.5954022988505747e-07,
"loss": 0.0637,
"step": 190
},
{
"epoch": 1.084985835694051,
"grad_norm": 7.306890964508057,
"learning_rate": 1.5908045977011493e-07,
"loss": 0.0695,
"step": 192
},
{
"epoch": 1.0963172804532577,
"grad_norm": 7.079704284667969,
"learning_rate": 1.5862068965517243e-07,
"loss": 0.0646,
"step": 194
},
{
"epoch": 1.1076487252124645,
"grad_norm": 7.926516532897949,
"learning_rate": 1.5816091954022986e-07,
"loss": 0.0676,
"step": 196
},
{
"epoch": 1.1189801699716715,
"grad_norm": 8.795439720153809,
"learning_rate": 1.5770114942528733e-07,
"loss": 0.0933,
"step": 198
},
{
"epoch": 1.1303116147308783,
"grad_norm": 7.790001392364502,
"learning_rate": 1.5724137931034482e-07,
"loss": 0.0677,
"step": 200
},
{
"epoch": 1.1303116147308783,
"eval_loss": 0.03847797214984894,
"eval_runtime": 58.5674,
"eval_samples_per_second": 5.976,
"eval_steps_per_second": 1.998,
"step": 200
},
{
"epoch": 1.141643059490085,
"grad_norm": 7.721756458282471,
"learning_rate": 1.567816091954023e-07,
"loss": 0.0791,
"step": 202
},
{
"epoch": 1.1529745042492918,
"grad_norm": 7.5822343826293945,
"learning_rate": 1.5632183908045978e-07,
"loss": 0.0766,
"step": 204
},
{
"epoch": 1.1643059490084986,
"grad_norm": 6.944662570953369,
"learning_rate": 1.5586206896551724e-07,
"loss": 0.0695,
"step": 206
},
{
"epoch": 1.1756373937677054,
"grad_norm": 7.224205493927002,
"learning_rate": 1.554022988505747e-07,
"loss": 0.0754,
"step": 208
},
{
"epoch": 1.1869688385269122,
"grad_norm": 8.137146949768066,
"learning_rate": 1.5494252873563217e-07,
"loss": 0.0795,
"step": 210
},
{
"epoch": 1.198300283286119,
"grad_norm": 6.717535018920898,
"learning_rate": 1.5448275862068964e-07,
"loss": 0.0591,
"step": 212
},
{
"epoch": 1.2096317280453257,
"grad_norm": 7.468085765838623,
"learning_rate": 1.540229885057471e-07,
"loss": 0.0556,
"step": 214
},
{
"epoch": 1.2209631728045325,
"grad_norm": 7.386537551879883,
"learning_rate": 1.535632183908046e-07,
"loss": 0.0635,
"step": 216
},
{
"epoch": 1.2322946175637393,
"grad_norm": 6.413255214691162,
"learning_rate": 1.5310344827586206e-07,
"loss": 0.0586,
"step": 218
},
{
"epoch": 1.2436260623229463,
"grad_norm": 6.197726726531982,
"learning_rate": 1.5264367816091955e-07,
"loss": 0.0593,
"step": 220
},
{
"epoch": 1.254957507082153,
"grad_norm": 6.984250068664551,
"learning_rate": 1.52183908045977e-07,
"loss": 0.0687,
"step": 222
},
{
"epoch": 1.2662889518413598,
"grad_norm": 6.338926315307617,
"learning_rate": 1.5172413793103446e-07,
"loss": 0.0648,
"step": 224
},
{
"epoch": 1.2776203966005666,
"grad_norm": 6.7211384773254395,
"learning_rate": 1.5126436781609195e-07,
"loss": 0.074,
"step": 226
},
{
"epoch": 1.2889518413597734,
"grad_norm": 7.576125621795654,
"learning_rate": 1.5080459770114942e-07,
"loss": 0.0701,
"step": 228
},
{
"epoch": 1.3002832861189801,
"grad_norm": 7.268312454223633,
"learning_rate": 1.5034482758620688e-07,
"loss": 0.069,
"step": 230
},
{
"epoch": 1.311614730878187,
"grad_norm": 7.681404113769531,
"learning_rate": 1.4988505747126437e-07,
"loss": 0.0644,
"step": 232
},
{
"epoch": 1.3229461756373937,
"grad_norm": 7.724630832672119,
"learning_rate": 1.4942528735632184e-07,
"loss": 0.0671,
"step": 234
},
{
"epoch": 1.3342776203966005,
"grad_norm": 7.316229820251465,
"learning_rate": 1.489655172413793e-07,
"loss": 0.059,
"step": 236
},
{
"epoch": 1.3456090651558075,
"grad_norm": 6.948664665222168,
"learning_rate": 1.4850574712643677e-07,
"loss": 0.0628,
"step": 238
},
{
"epoch": 1.356940509915014,
"grad_norm": 7.636634349822998,
"learning_rate": 1.4804597701149423e-07,
"loss": 0.0753,
"step": 240
},
{
"epoch": 1.368271954674221,
"grad_norm": 8.362703323364258,
"learning_rate": 1.4758620689655173e-07,
"loss": 0.0819,
"step": 242
},
{
"epoch": 1.3796033994334278,
"grad_norm": 7.090531826019287,
"learning_rate": 1.471264367816092e-07,
"loss": 0.0663,
"step": 244
},
{
"epoch": 1.3909348441926346,
"grad_norm": 7.547560214996338,
"learning_rate": 1.4666666666666666e-07,
"loss": 0.0635,
"step": 246
},
{
"epoch": 1.4022662889518414,
"grad_norm": 7.471890449523926,
"learning_rate": 1.4620689655172412e-07,
"loss": 0.0517,
"step": 248
},
{
"epoch": 1.4135977337110481,
"grad_norm": 6.707093238830566,
"learning_rate": 1.457471264367816e-07,
"loss": 0.075,
"step": 250
},
{
"epoch": 1.4135977337110481,
"eval_loss": 0.035781800746917725,
"eval_runtime": 58.5812,
"eval_samples_per_second": 5.975,
"eval_steps_per_second": 1.997,
"step": 250
},
{
"epoch": 1.424929178470255,
"grad_norm": 7.443450450897217,
"learning_rate": 1.4528735632183908e-07,
"loss": 0.0661,
"step": 252
},
{
"epoch": 1.4362606232294617,
"grad_norm": 7.724794387817383,
"learning_rate": 1.4482758620689654e-07,
"loss": 0.0651,
"step": 254
},
{
"epoch": 1.4475920679886687,
"grad_norm": 6.492702007293701,
"learning_rate": 1.44367816091954e-07,
"loss": 0.0481,
"step": 256
},
{
"epoch": 1.4589235127478752,
"grad_norm": 6.878933906555176,
"learning_rate": 1.439080459770115e-07,
"loss": 0.064,
"step": 258
},
{
"epoch": 1.4702549575070822,
"grad_norm": 7.361611366271973,
"learning_rate": 1.4344827586206897e-07,
"loss": 0.0644,
"step": 260
},
{
"epoch": 1.481586402266289,
"grad_norm": 6.968555927276611,
"learning_rate": 1.4298850574712643e-07,
"loss": 0.0609,
"step": 262
},
{
"epoch": 1.4929178470254958,
"grad_norm": 7.533900260925293,
"learning_rate": 1.425287356321839e-07,
"loss": 0.0607,
"step": 264
},
{
"epoch": 1.5042492917847026,
"grad_norm": 7.612795829772949,
"learning_rate": 1.4206896551724136e-07,
"loss": 0.0617,
"step": 266
},
{
"epoch": 1.5155807365439093,
"grad_norm": 7.662258148193359,
"learning_rate": 1.4160919540229885e-07,
"loss": 0.0737,
"step": 268
},
{
"epoch": 1.5269121813031161,
"grad_norm": 7.1722893714904785,
"learning_rate": 1.4114942528735632e-07,
"loss": 0.0615,
"step": 270
},
{
"epoch": 1.538243626062323,
"grad_norm": 7.579257011413574,
"learning_rate": 1.4068965517241379e-07,
"loss": 0.0647,
"step": 272
},
{
"epoch": 1.54957507082153,
"grad_norm": 7.028329372406006,
"learning_rate": 1.4022988505747128e-07,
"loss": 0.0698,
"step": 274
},
{
"epoch": 1.5609065155807365,
"grad_norm": 7.981624126434326,
"learning_rate": 1.3977011494252872e-07,
"loss": 0.0704,
"step": 276
},
{
"epoch": 1.5722379603399435,
"grad_norm": 7.229209899902344,
"learning_rate": 1.3931034482758618e-07,
"loss": 0.0662,
"step": 278
},
{
"epoch": 1.58356940509915,
"grad_norm": 8.259303092956543,
"learning_rate": 1.3885057471264367e-07,
"loss": 0.0622,
"step": 280
},
{
"epoch": 1.594900849858357,
"grad_norm": 8.45683765411377,
"learning_rate": 1.3839080459770114e-07,
"loss": 0.0846,
"step": 282
},
{
"epoch": 1.6062322946175638,
"grad_norm": 8.067914009094238,
"learning_rate": 1.3793103448275863e-07,
"loss": 0.0743,
"step": 284
},
{
"epoch": 1.6175637393767706,
"grad_norm": 7.243266582489014,
"learning_rate": 1.374712643678161e-07,
"loss": 0.0646,
"step": 286
},
{
"epoch": 1.6288951841359773,
"grad_norm": 7.551538944244385,
"learning_rate": 1.3701149425287356e-07,
"loss": 0.0682,
"step": 288
},
{
"epoch": 1.6402266288951841,
"grad_norm": 8.334272384643555,
"learning_rate": 1.3655172413793103e-07,
"loss": 0.0614,
"step": 290
},
{
"epoch": 1.651558073654391,
"grad_norm": 7.791774749755859,
"learning_rate": 1.360919540229885e-07,
"loss": 0.0558,
"step": 292
},
{
"epoch": 1.6628895184135977,
"grad_norm": 8.262134552001953,
"learning_rate": 1.3563218390804596e-07,
"loss": 0.0575,
"step": 294
},
{
"epoch": 1.6742209631728047,
"grad_norm": 7.904037952423096,
"learning_rate": 1.3517241379310345e-07,
"loss": 0.079,
"step": 296
},
{
"epoch": 1.6855524079320112,
"grad_norm": 7.991185188293457,
"learning_rate": 1.3471264367816091e-07,
"loss": 0.0616,
"step": 298
},
{
"epoch": 1.6968838526912182,
"grad_norm": 7.316665172576904,
"learning_rate": 1.342528735632184e-07,
"loss": 0.0575,
"step": 300
},
{
"epoch": 1.6968838526912182,
"eval_loss": 0.0336877666413784,
"eval_runtime": 58.6227,
"eval_samples_per_second": 5.97,
"eval_steps_per_second": 1.996,
"step": 300
}
],
"logging_steps": 2,
"max_steps": 880,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.292928973307904e+16,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}