| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 336, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005961251862891207, |
| "grad_norm": 83.69901275634766, |
| "learning_rate": 0.0, |
| "loss": 2.889, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.011922503725782414, |
| "grad_norm": 101.080810546875, |
| "learning_rate": 4.5454545454545457e-07, |
| "loss": 2.8534, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.01788375558867362, |
| "grad_norm": 133.3448028564453, |
| "learning_rate": 9.090909090909091e-07, |
| "loss": 2.7436, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.02384500745156483, |
| "grad_norm": 181.99168395996094, |
| "learning_rate": 1.3636363636363636e-06, |
| "loss": 2.9621, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.029806259314456036, |
| "grad_norm": 7933.1826171875, |
| "learning_rate": 1.8181818181818183e-06, |
| "loss": 2.7936, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.03576751117734724, |
| "grad_norm": 97.71904754638672, |
| "learning_rate": 2.2727272727272728e-06, |
| "loss": 2.8072, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.041728763040238454, |
| "grad_norm": 275.55499267578125, |
| "learning_rate": 2.7272727272727272e-06, |
| "loss": 2.8261, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.04769001490312966, |
| "grad_norm": 64.11652374267578, |
| "learning_rate": 3.181818181818182e-06, |
| "loss": 2.7701, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.05365126676602087, |
| "grad_norm": 73.77549743652344, |
| "learning_rate": 3.6363636363636366e-06, |
| "loss": 2.5822, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.05961251862891207, |
| "grad_norm": 69.88496398925781, |
| "learning_rate": 4.0909090909090915e-06, |
| "loss": 2.2658, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06557377049180328, |
| "grad_norm": 34.03704833984375, |
| "learning_rate": 4.5454545454545455e-06, |
| "loss": 2.0155, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.07153502235469449, |
| "grad_norm": 47.30327224731445, |
| "learning_rate": 5e-06, |
| "loss": 1.9482, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.07749627421758569, |
| "grad_norm": 36.03778076171875, |
| "learning_rate": 4.9998832008573975e-06, |
| "loss": 1.9698, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.08345752608047691, |
| "grad_norm": 28.477731704711914, |
| "learning_rate": 4.999532814343219e-06, |
| "loss": 1.9087, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.08941877794336811, |
| "grad_norm": 20.54388427734375, |
| "learning_rate": 4.998948873197342e-06, |
| "loss": 2.0298, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.09538002980625931, |
| "grad_norm": 21.707738876342773, |
| "learning_rate": 4.998131431982826e-06, |
| "loss": 1.8872, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.10134128166915052, |
| "grad_norm": 16.112577438354492, |
| "learning_rate": 4.9970805670808174e-06, |
| "loss": 2.0104, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.10730253353204174, |
| "grad_norm": 14.217989921569824, |
| "learning_rate": 4.995796376683411e-06, |
| "loss": 1.9303, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.11326378539493294, |
| "grad_norm": 15.643560409545898, |
| "learning_rate": 4.994278980784478e-06, |
| "loss": 1.9199, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.11922503725782414, |
| "grad_norm": 17.744998931884766, |
| "learning_rate": 4.992528521168449e-06, |
| "loss": 1.9722, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.12518628912071536, |
| "grad_norm": 13.886046409606934, |
| "learning_rate": 4.990545161397073e-06, |
| "loss": 1.8426, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.13114754098360656, |
| "grad_norm": 11.871990203857422, |
| "learning_rate": 4.988329086794122e-06, |
| "loss": 1.9522, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.13710879284649777, |
| "grad_norm": 13.753679275512695, |
| "learning_rate": 4.98588050442809e-06, |
| "loss": 1.9687, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.14307004470938897, |
| "grad_norm": 17.359724044799805, |
| "learning_rate": 4.983199643092833e-06, |
| "loss": 1.9103, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.14903129657228018, |
| "grad_norm": 12.743254661560059, |
| "learning_rate": 4.980286753286196e-06, |
| "loss": 1.8854, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.15499254843517138, |
| "grad_norm": 13.894977569580078, |
| "learning_rate": 4.977142107186602e-06, |
| "loss": 1.8205, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.16095380029806258, |
| "grad_norm": 45.56089401245117, |
| "learning_rate": 4.973765998627628e-06, |
| "loss": 1.8602, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.16691505216095381, |
| "grad_norm": 12.6051664352417, |
| "learning_rate": 4.970158743070542e-06, |
| "loss": 1.7526, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.17287630402384502, |
| "grad_norm": 15.922001838684082, |
| "learning_rate": 4.966320677574828e-06, |
| "loss": 1.7807, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.17883755588673622, |
| "grad_norm": 17.59123992919922, |
| "learning_rate": 4.9622521607666936e-06, |
| "loss": 1.8304, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.18479880774962743, |
| "grad_norm": 35.71995162963867, |
| "learning_rate": 4.957953572805558e-06, |
| "loss": 1.8205, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.19076005961251863, |
| "grad_norm": 14.873682975769043, |
| "learning_rate": 4.953425315348534e-06, |
| "loss": 1.7274, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.19672131147540983, |
| "grad_norm": 11.887964248657227, |
| "learning_rate": 4.94866781151289e-06, |
| "loss": 1.8059, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.20268256333830104, |
| "grad_norm": 20.716062545776367, |
| "learning_rate": 4.943681505836523e-06, |
| "loss": 1.7239, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.20864381520119224, |
| "grad_norm": 16.12997817993164, |
| "learning_rate": 4.938466864236413e-06, |
| "loss": 1.6606, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.21460506706408347, |
| "grad_norm": 16.572912216186523, |
| "learning_rate": 4.933024373965097e-06, |
| "loss": 1.7082, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.22056631892697467, |
| "grad_norm": 15.159687042236328, |
| "learning_rate": 4.927354543565131e-06, |
| "loss": 1.5943, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.22652757078986588, |
| "grad_norm": 24.082124710083008, |
| "learning_rate": 4.921457902821578e-06, |
| "loss": 1.6194, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.23248882265275708, |
| "grad_norm": 14.97282600402832, |
| "learning_rate": 4.915335002712506e-06, |
| "loss": 1.6057, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.23845007451564829, |
| "grad_norm": 21.004867553710938, |
| "learning_rate": 4.9089864153575016e-06, |
| "loss": 1.7454, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.2444113263785395, |
| "grad_norm": 31.531965255737305, |
| "learning_rate": 4.902412733964212e-06, |
| "loss": 1.5509, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.2503725782414307, |
| "grad_norm": 120.07006072998047, |
| "learning_rate": 4.895614572772916e-06, |
| "loss": 1.6287, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.2563338301043219, |
| "grad_norm": 17.399808883666992, |
| "learning_rate": 4.888592566999134e-06, |
| "loss": 1.583, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.26229508196721313, |
| "grad_norm": 17.383283615112305, |
| "learning_rate": 4.88134737277427e-06, |
| "loss": 1.6961, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.26825633383010433, |
| "grad_norm": 17.128171920776367, |
| "learning_rate": 4.873879667084301e-06, |
| "loss": 1.4649, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.27421758569299554, |
| "grad_norm": 42.8609619140625, |
| "learning_rate": 4.866190147706525e-06, |
| "loss": 1.5739, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.28017883755588674, |
| "grad_norm": 31.926063537597656, |
| "learning_rate": 4.858279533144358e-06, |
| "loss": 1.565, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.28614008941877794, |
| "grad_norm": 23.29640769958496, |
| "learning_rate": 4.8501485625602e-06, |
| "loss": 1.5657, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.29210134128166915, |
| "grad_norm": 25.43850326538086, |
| "learning_rate": 4.841797995706362e-06, |
| "loss": 1.5481, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.29806259314456035, |
| "grad_norm": 40.31404495239258, |
| "learning_rate": 4.833228612854088e-06, |
| "loss": 1.404, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.30402384500745155, |
| "grad_norm": 30.751497268676758, |
| "learning_rate": 4.824441214720629e-06, |
| "loss": 1.7065, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.30998509687034276, |
| "grad_norm": 21.12566375732422, |
| "learning_rate": 4.815436622394442e-06, |
| "loss": 1.4373, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.31594634873323396, |
| "grad_norm": 41.19392776489258, |
| "learning_rate": 4.806215677258456e-06, |
| "loss": 1.4945, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.32190760059612517, |
| "grad_norm": 32.416107177734375, |
| "learning_rate": 4.796779240911461e-06, |
| "loss": 1.4667, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.32786885245901637, |
| "grad_norm": 19.345081329345703, |
| "learning_rate": 4.787128195087596e-06, |
| "loss": 1.4775, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.33383010432190763, |
| "grad_norm": 27.58094024658203, |
| "learning_rate": 4.777263441573963e-06, |
| "loss": 1.4283, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.33979135618479883, |
| "grad_norm": 23.647533416748047, |
| "learning_rate": 4.7671859021263635e-06, |
| "loss": 1.4977, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.34575260804769004, |
| "grad_norm": 33.37125015258789, |
| "learning_rate": 4.756896518383173e-06, |
| "loss": 1.4604, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.35171385991058124, |
| "grad_norm": 24.486753463745117, |
| "learning_rate": 4.746396251777348e-06, |
| "loss": 1.5416, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.35767511177347244, |
| "grad_norm": 21.387479782104492, |
| "learning_rate": 4.7356860834466e-06, |
| "loss": 1.4861, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.36363636363636365, |
| "grad_norm": 16.769519805908203, |
| "learning_rate": 4.72476701414171e-06, |
| "loss": 1.3392, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.36959761549925485, |
| "grad_norm": 28.92209815979004, |
| "learning_rate": 4.7136400641330245e-06, |
| "loss": 1.4868, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.37555886736214605, |
| "grad_norm": 29.672182083129883, |
| "learning_rate": 4.702306273115122e-06, |
| "loss": 1.405, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.38152011922503726, |
| "grad_norm": 23.519418716430664, |
| "learning_rate": 4.690766700109659e-06, |
| "loss": 1.4502, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.38748137108792846, |
| "grad_norm": 46.69130325317383, |
| "learning_rate": 4.679022423366424e-06, |
| "loss": 1.4917, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.39344262295081966, |
| "grad_norm": 14.234804153442383, |
| "learning_rate": 4.667074540262577e-06, |
| "loss": 1.4556, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.39940387481371087, |
| "grad_norm": 50.061729431152344, |
| "learning_rate": 4.654924167200124e-06, |
| "loss": 1.5412, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.40536512667660207, |
| "grad_norm": 26.193071365356445, |
| "learning_rate": 4.6425724395015865e-06, |
| "loss": 1.4453, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.4113263785394933, |
| "grad_norm": 150.2034454345703, |
| "learning_rate": 4.63002051130393e-06, |
| "loss": 1.4456, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.4172876304023845, |
| "grad_norm": 30.46050262451172, |
| "learning_rate": 4.617269555450715e-06, |
| "loss": 1.4075, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.4232488822652757, |
| "grad_norm": 27.013071060180664, |
| "learning_rate": 4.604320763382512e-06, |
| "loss": 1.3264, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.42921013412816694, |
| "grad_norm": 29.879329681396484, |
| "learning_rate": 4.591175345025567e-06, |
| "loss": 1.3855, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.43517138599105815, |
| "grad_norm": 20.89598274230957, |
| "learning_rate": 4.5778345286787575e-06, |
| "loss": 1.4808, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.44113263785394935, |
| "grad_norm": 24.181364059448242, |
| "learning_rate": 4.56429956089881e-06, |
| "loss": 1.5116, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.44709388971684055, |
| "grad_norm": 16.009437561035156, |
| "learning_rate": 4.550571706383833e-06, |
| "loss": 1.3564, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.45305514157973176, |
| "grad_norm": 22.36256217956543, |
| "learning_rate": 4.536652247855133e-06, |
| "loss": 1.4962, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.45901639344262296, |
| "grad_norm": 20.505599975585938, |
| "learning_rate": 4.522542485937369e-06, |
| "loss": 1.3784, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.46497764530551416, |
| "grad_norm": 16.868972778320312, |
| "learning_rate": 4.508243739037016e-06, |
| "loss": 1.4388, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.47093889716840537, |
| "grad_norm": 31.546695709228516, |
| "learning_rate": 4.4937573432191766e-06, |
| "loss": 1.4437, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.47690014903129657, |
| "grad_norm": 23.937522888183594, |
| "learning_rate": 4.47908465208274e-06, |
| "loss": 1.3848, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.4828614008941878, |
| "grad_norm": 26.49115753173828, |
| "learning_rate": 4.464227036633901e-06, |
| "loss": 1.3967, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.488822652757079, |
| "grad_norm": 28.350820541381836, |
| "learning_rate": 4.449185885158056e-06, |
| "loss": 1.4275, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.4947839046199702, |
| "grad_norm": 332.1685485839844, |
| "learning_rate": 4.433962603090083e-06, |
| "loss": 1.2602, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.5007451564828614, |
| "grad_norm": 22.370702743530273, |
| "learning_rate": 4.418558612883016e-06, |
| "loss": 1.3573, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.5067064083457526, |
| "grad_norm": 17.595949172973633, |
| "learning_rate": 4.402975353875134e-06, |
| "loss": 1.304, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.5126676602086438, |
| "grad_norm": 36.13862228393555, |
| "learning_rate": 4.3872142821554695e-06, |
| "loss": 1.3921, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.518628912071535, |
| "grad_norm": 21.56857681274414, |
| "learning_rate": 4.3712768704277535e-06, |
| "loss": 1.3921, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.5245901639344263, |
| "grad_norm": 12.474554061889648, |
| "learning_rate": 4.355164607872806e-06, |
| "loss": 1.3618, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.5305514157973175, |
| "grad_norm": 19.896780014038086, |
| "learning_rate": 4.338879000009389e-06, |
| "loss": 1.2586, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.5365126676602087, |
| "grad_norm": 29.72560691833496, |
| "learning_rate": 4.322421568553529e-06, |
| "loss": 1.1915, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.5424739195230999, |
| "grad_norm": 30.1799259185791, |
| "learning_rate": 4.305793851276335e-06, |
| "loss": 1.4017, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.5484351713859911, |
| "grad_norm": 17.576276779174805, |
| "learning_rate": 4.288997401860303e-06, |
| "loss": 1.2945, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.5543964232488823, |
| "grad_norm": 13.149911880493164, |
| "learning_rate": 4.272033789754146e-06, |
| "loss": 1.2814, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.5603576751117735, |
| "grad_norm": 77.57002258300781, |
| "learning_rate": 4.254904600026143e-06, |
| "loss": 1.3343, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.5663189269746647, |
| "grad_norm": 20.16973114013672, |
| "learning_rate": 4.2376114332160325e-06, |
| "loss": 1.2908, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.5722801788375559, |
| "grad_norm": 13.326008796691895, |
| "learning_rate": 4.220155905185461e-06, |
| "loss": 1.2593, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.5782414307004471, |
| "grad_norm": 311.2110290527344, |
| "learning_rate": 4.202539646966993e-06, |
| "loss": 1.3667, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.5842026825633383, |
| "grad_norm": 15.870148658752441, |
| "learning_rate": 4.184764304611715e-06, |
| "loss": 1.4308, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.5901639344262295, |
| "grad_norm": 31.768085479736328, |
| "learning_rate": 4.166831539035423e-06, |
| "loss": 1.3856, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.5961251862891207, |
| "grad_norm": 17.089149475097656, |
| "learning_rate": 4.148743025863432e-06, |
| "loss": 1.3226, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.6020864381520119, |
| "grad_norm": 29.87621307373047, |
| "learning_rate": 4.130500455274005e-06, |
| "loss": 1.3646, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.6080476900149031, |
| "grad_norm": 42.509464263916016, |
| "learning_rate": 4.112105531840427e-06, |
| "loss": 1.2531, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.6140089418777943, |
| "grad_norm": 40.10889434814453, |
| "learning_rate": 4.093559974371725e-06, |
| "loss": 1.3317, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.6199701937406855, |
| "grad_norm": 22.96758270263672, |
| "learning_rate": 4.074865515752068e-06, |
| "loss": 1.2678, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.6259314456035767, |
| "grad_norm": 41.362876892089844, |
| "learning_rate": 4.056023902778846e-06, |
| "loss": 1.2246, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.6318926974664679, |
| "grad_norm": 17.242877960205078, |
| "learning_rate": 4.037036895999453e-06, |
| "loss": 1.3034, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.6378539493293591, |
| "grad_norm": 28.293981552124023, |
| "learning_rate": 4.017906269546778e-06, |
| "loss": 1.4299, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.6438152011922503, |
| "grad_norm": 26.767963409423828, |
| "learning_rate": 3.9986338109734354e-06, |
| "loss": 1.341, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.6497764530551415, |
| "grad_norm": 15.613986015319824, |
| "learning_rate": 3.979221321084734e-06, |
| "loss": 1.2554, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.6557377049180327, |
| "grad_norm": 20.716270446777344, |
| "learning_rate": 3.959670613770414e-06, |
| "loss": 1.3222, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.6616989567809239, |
| "grad_norm": 9.510931015014648, |
| "learning_rate": 3.939983515835157e-06, |
| "loss": 1.2529, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.6676602086438153, |
| "grad_norm": 23.321115493774414, |
| "learning_rate": 3.92016186682789e-06, |
| "loss": 1.2977, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.6736214605067065, |
| "grad_norm": 57.38838577270508, |
| "learning_rate": 3.900207518869901e-06, |
| "loss": 1.3167, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.6795827123695977, |
| "grad_norm": 22.515439987182617, |
| "learning_rate": 3.880122336481774e-06, |
| "loss": 1.2809, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.6855439642324889, |
| "grad_norm": 12.670492172241211, |
| "learning_rate": 3.859908196409177e-06, |
| "loss": 1.2513, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.6915052160953801, |
| "grad_norm": 18.576292037963867, |
| "learning_rate": 3.839566987447492e-06, |
| "loss": 1.3572, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.6974664679582713, |
| "grad_norm": 22.152652740478516, |
| "learning_rate": 3.819100610265332e-06, |
| "loss": 1.2051, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.7034277198211625, |
| "grad_norm": 12.753473281860352, |
| "learning_rate": 3.7985109772269435e-06, |
| "loss": 1.275, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.7093889716840537, |
| "grad_norm": 23.44584846496582, |
| "learning_rate": 3.777800012213514e-06, |
| "loss": 1.3305, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.7153502235469449, |
| "grad_norm": 18.46709442138672, |
| "learning_rate": 3.756969650443408e-06, |
| "loss": 1.2177, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.7213114754098361, |
| "grad_norm": 18.8616886138916, |
| "learning_rate": 3.7360218382913426e-06, |
| "loss": 1.3302, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 39.76200485229492, |
| "learning_rate": 3.714958533106515e-06, |
| "loss": 1.3266, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.7332339791356185, |
| "grad_norm": 17.528858184814453, |
| "learning_rate": 3.6937817030297164e-06, |
| "loss": 1.2974, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.7391952309985097, |
| "grad_norm": 29.327545166015625, |
| "learning_rate": 3.672493326809422e-06, |
| "loss": 1.2878, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.7451564828614009, |
| "grad_norm": 14.248793601989746, |
| "learning_rate": 3.651095393616904e-06, |
| "loss": 1.2808, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.7511177347242921, |
| "grad_norm": 14.539497375488281, |
| "learning_rate": 3.629589902860363e-06, |
| "loss": 1.2453, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.7570789865871833, |
| "grad_norm": 15.273882865905762, |
| "learning_rate": 3.607978863998104e-06, |
| "loss": 1.3493, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.7630402384500745, |
| "grad_norm": 14.30077075958252, |
| "learning_rate": 3.586264296350775e-06, |
| "loss": 1.2065, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.7690014903129657, |
| "grad_norm": 25.420032501220703, |
| "learning_rate": 3.564448228912682e-06, |
| "loss": 1.3278, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.7749627421758569, |
| "grad_norm": 18.863452911376953, |
| "learning_rate": 3.5425327001622034e-06, |
| "loss": 1.1876, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.7809239940387481, |
| "grad_norm": 18.1572322845459, |
| "learning_rate": 3.520519757871313e-06, |
| "loss": 1.2363, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.7868852459016393, |
| "grad_norm": 21.26305389404297, |
| "learning_rate": 3.4984114589142388e-06, |
| "loss": 1.2117, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.7928464977645305, |
| "grad_norm": 34.8569221496582, |
| "learning_rate": 3.476209869075273e-06, |
| "loss": 1.2962, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.7988077496274217, |
| "grad_norm": 11.790558815002441, |
| "learning_rate": 3.4539170628557383e-06, |
| "loss": 1.2112, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.8047690014903129, |
| "grad_norm": 71.28162384033203, |
| "learning_rate": 3.4315351232801597e-06, |
| "loss": 1.1849, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.8107302533532041, |
| "grad_norm": 17.917343139648438, |
| "learning_rate": 3.409066141701618e-06, |
| "loss": 1.2936, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.8166915052160953, |
| "grad_norm": 14.670162200927734, |
| "learning_rate": 3.386512217606339e-06, |
| "loss": 1.3622, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.8226527570789866, |
| "grad_norm": 135.63331604003906, |
| "learning_rate": 3.3638754584175222e-06, |
| "loss": 1.2687, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.8286140089418778, |
| "grad_norm": 20.351055145263672, |
| "learning_rate": 3.3411579792984178e-06, |
| "loss": 1.2849, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.834575260804769, |
| "grad_norm": 15.531908988952637, |
| "learning_rate": 3.318361902954692e-06, |
| "loss": 1.2378, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.8405365126676602, |
| "grad_norm": 92.7786865234375, |
| "learning_rate": 3.295489359436083e-06, |
| "loss": 1.3383, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.8464977645305514, |
| "grad_norm": 17.087692260742188, |
| "learning_rate": 3.272542485937369e-06, |
| "loss": 1.2544, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.8524590163934426, |
| "grad_norm": 21.718421936035156, |
| "learning_rate": 3.249523426598669e-06, |
| "loss": 1.2632, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.8584202682563339, |
| "grad_norm": 15.318682670593262, |
| "learning_rate": 3.2264343323050985e-06, |
| "loss": 1.1569, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.8643815201192251, |
| "grad_norm": 23.35086441040039, |
| "learning_rate": 3.2032773604857915e-06, |
| "loss": 1.1956, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.8703427719821163, |
| "grad_norm": 40.38860321044922, |
| "learning_rate": 3.1800546749123108e-06, |
| "loss": 1.2296, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.8763040238450075, |
| "grad_norm": 90.90715026855469, |
| "learning_rate": 3.1567684454964674e-06, |
| "loss": 1.1541, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.8822652757078987, |
| "grad_norm": 21.368518829345703, |
| "learning_rate": 3.133420848087566e-06, |
| "loss": 1.3271, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.8882265275707899, |
| "grad_norm": 88.14790344238281, |
| "learning_rate": 3.110014064269094e-06, |
| "loss": 1.2658, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.8941877794336811, |
| "grad_norm": 21.05089569091797, |
| "learning_rate": 3.0865502811548755e-06, |
| "loss": 1.1987, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.9001490312965723, |
| "grad_norm": 16.833553314208984, |
| "learning_rate": 3.0630316911847112e-06, |
| "loss": 1.2963, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.9061102831594635, |
| "grad_norm": 13.26325511932373, |
| "learning_rate": 3.039460491919516e-06, |
| "loss": 1.2032, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.9120715350223547, |
| "grad_norm": 20.50226402282715, |
| "learning_rate": 3.015838885835981e-06, |
| "loss": 1.1907, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.9180327868852459, |
| "grad_norm": 14.766148567199707, |
| "learning_rate": 2.992169080120776e-06, |
| "loss": 1.2646, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.9239940387481371, |
| "grad_norm": 25.122909545898438, |
| "learning_rate": 2.9684532864643123e-06, |
| "loss": 1.256, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.9299552906110283, |
| "grad_norm": 26.095359802246094, |
| "learning_rate": 2.944693720854081e-06, |
| "loss": 1.2392, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.9359165424739195, |
| "grad_norm": 9.14566421508789, |
| "learning_rate": 2.920892603367596e-06, |
| "loss": 1.1698, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.9418777943368107, |
| "grad_norm": 28.01599884033203, |
| "learning_rate": 2.897052157964952e-06, |
| "loss": 1.2073, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.9478390461997019, |
| "grad_norm": 27.741567611694336, |
| "learning_rate": 2.8731746122810105e-06, |
| "loss": 1.243, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.9538002980625931, |
| "grad_norm": 14.678361892700195, |
| "learning_rate": 2.8492621974172653e-06, |
| "loss": 1.1756, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.9597615499254843, |
| "grad_norm": 15.269275665283203, |
| "learning_rate": 2.8253171477333585e-06, |
| "loss": 1.1728, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.9657228017883756, |
| "grad_norm": 64.93920135498047, |
| "learning_rate": 2.8013417006383078e-06, |
| "loss": 1.2727, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.9716840536512668, |
| "grad_norm": 15.335084915161133, |
| "learning_rate": 2.7773380963814454e-06, |
| "loss": 1.2749, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.977645305514158, |
| "grad_norm": 37.09837341308594, |
| "learning_rate": 2.7533085778430884e-06, |
| "loss": 1.1588, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.9836065573770492, |
| "grad_norm": 130.5791778564453, |
| "learning_rate": 2.729255390324966e-06, |
| "loss": 1.2001, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.9895678092399404, |
| "grad_norm": 50.33445358276367, |
| "learning_rate": 2.7051807813404213e-06, |
| "loss": 1.2772, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.9955290611028316, |
| "grad_norm": 32.990840911865234, |
| "learning_rate": 2.6810870004044065e-06, |
| "loss": 1.2903, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 26.173009872436523, |
| "learning_rate": 2.6569762988232838e-06, |
| "loss": 0.8979, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.0059612518628913, |
| "grad_norm": 12.552504539489746, |
| "learning_rate": 2.632850929484472e-06, |
| "loss": 1.0755, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.0119225037257824, |
| "grad_norm": 17.015302658081055, |
| "learning_rate": 2.6087131466459344e-06, |
| "loss": 1.2362, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.0178837555886737, |
| "grad_norm": 49.570556640625, |
| "learning_rate": 2.5845652057255414e-06, |
| "loss": 1.2128, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.0238450074515648, |
| "grad_norm": 42.604183197021484, |
| "learning_rate": 2.560409363090331e-06, |
| "loss": 1.1702, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.0298062593144561, |
| "grad_norm": 20.264986038208008, |
| "learning_rate": 2.536247875845669e-06, |
| "loss": 1.1781, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.0357675111773472, |
| "grad_norm": 12.800230979919434, |
| "learning_rate": 2.5120830016243515e-06, |
| "loss": 1.1768, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.0417287630402385, |
| "grad_norm": 23.823047637939453, |
| "learning_rate": 2.4879169983756498e-06, |
| "loss": 1.2566, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.0476900149031296, |
| "grad_norm": 21.49330711364746, |
| "learning_rate": 2.4637521241543315e-06, |
| "loss": 1.084, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.053651266766021, |
| "grad_norm": 23.708070755004883, |
| "learning_rate": 2.43959063690967e-06, |
| "loss": 1.1973, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.059612518628912, |
| "grad_norm": 14.244556427001953, |
| "learning_rate": 2.415434794274459e-06, |
| "loss": 1.1781, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.0655737704918034, |
| "grad_norm": 18.469478607177734, |
| "learning_rate": 2.3912868533540665e-06, |
| "loss": 1.0891, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.0715350223546944, |
| "grad_norm": 13.801828384399414, |
| "learning_rate": 2.3671490705155285e-06, |
| "loss": 1.1838, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.0774962742175858, |
| "grad_norm": 97.42760467529297, |
| "learning_rate": 2.3430237011767166e-06, |
| "loss": 1.1887, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.0834575260804769, |
| "grad_norm": 22.47690200805664, |
| "learning_rate": 2.3189129995955944e-06, |
| "loss": 1.2309, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.0894187779433682, |
| "grad_norm": 82.70649719238281, |
| "learning_rate": 2.2948192186595787e-06, |
| "loss": 1.2702, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.0953800298062593, |
| "grad_norm": 143.4379425048828, |
| "learning_rate": 2.2707446096750345e-06, |
| "loss": 1.2171, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.1013412816691506, |
| "grad_norm": 52.544227600097656, |
| "learning_rate": 2.246691422156913e-06, |
| "loss": 1.2009, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.1073025335320417, |
| "grad_norm": 41.363922119140625, |
| "learning_rate": 2.222661903618556e-06, |
| "loss": 1.1588, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.113263785394933, |
| "grad_norm": 12.45494556427002, |
| "learning_rate": 2.1986582993616926e-06, |
| "loss": 1.2078, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.119225037257824, |
| "grad_norm": 15.55370807647705, |
| "learning_rate": 2.1746828522666423e-06, |
| "loss": 1.1447, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.1251862891207154, |
| "grad_norm": 16.289379119873047, |
| "learning_rate": 2.1507378025827355e-06, |
| "loss": 1.2212, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.1311475409836065, |
| "grad_norm": 20.157041549682617, |
| "learning_rate": 2.12682538771899e-06, |
| "loss": 1.0832, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.1371087928464978, |
| "grad_norm": 43.083984375, |
| "learning_rate": 2.1029478420350493e-06, |
| "loss": 1.1558, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.1430700447093889, |
| "grad_norm": 43.636539459228516, |
| "learning_rate": 2.079107396632404e-06, |
| "loss": 1.2275, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.1490312965722802, |
| "grad_norm": 13.149125099182129, |
| "learning_rate": 2.0553062791459193e-06, |
| "loss": 1.0831, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.1549925484351713, |
| "grad_norm": 17.64810562133789, |
| "learning_rate": 2.031546713535688e-06, |
| "loss": 1.2873, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.1609538002980626, |
| "grad_norm": 23.816091537475586, |
| "learning_rate": 2.007830919879225e-06, |
| "loss": 1.1185, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.1669150521609537, |
| "grad_norm": 23.4075984954834, |
| "learning_rate": 1.9841611141640205e-06, |
| "loss": 1.1648, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.172876304023845, |
| "grad_norm": 75.27896881103516, |
| "learning_rate": 1.960539508080485e-06, |
| "loss": 1.1569, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.1788375558867363, |
| "grad_norm": 27.79124641418457, |
| "learning_rate": 1.936968308815289e-06, |
| "loss": 1.1506, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.1847988077496274, |
| "grad_norm": 88.08030700683594, |
| "learning_rate": 1.913449718845125e-06, |
| "loss": 1.1021, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.1907600596125185, |
| "grad_norm": 20.72780990600586, |
| "learning_rate": 1.8899859357309064e-06, |
| "loss": 1.1408, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.1967213114754098, |
| "grad_norm": 34.17177963256836, |
| "learning_rate": 1.8665791519124344e-06, |
| "loss": 1.2407, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.2026825633383011, |
| "grad_norm": 16.424531936645508, |
| "learning_rate": 1.8432315545035328e-06, |
| "loss": 1.2134, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.2086438152011922, |
| "grad_norm": 68.55332946777344, |
| "learning_rate": 1.8199453250876894e-06, |
| "loss": 1.1283, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.2146050670640836, |
| "grad_norm": 18.79085922241211, |
| "learning_rate": 1.796722639514209e-06, |
| "loss": 1.1635, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.2205663189269746, |
| "grad_norm": 18.540996551513672, |
| "learning_rate": 1.7735656676949028e-06, |
| "loss": 1.304, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.226527570789866, |
| "grad_norm": 24.513444900512695, |
| "learning_rate": 1.7504765734013323e-06, |
| "loss": 1.2594, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.232488822652757, |
| "grad_norm": 15.741020202636719, |
| "learning_rate": 1.7274575140626318e-06, |
| "loss": 1.1887, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.2384500745156484, |
| "grad_norm": 14.231233596801758, |
| "learning_rate": 1.7045106405639175e-06, |
| "loss": 1.2081, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.2444113263785395, |
| "grad_norm": 10.00545883178711, |
| "learning_rate": 1.6816380970453084e-06, |
| "loss": 1.1334, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.2503725782414308, |
| "grad_norm": 39.42339324951172, |
| "learning_rate": 1.6588420207015826e-06, |
| "loss": 1.2011, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.2563338301043219, |
| "grad_norm": 12.922039985656738, |
| "learning_rate": 1.6361245415824784e-06, |
| "loss": 1.1481, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.2622950819672132, |
| "grad_norm": 11.270161628723145, |
| "learning_rate": 1.613487782393661e-06, |
| "loss": 1.2256, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.2682563338301043, |
| "grad_norm": 47.40531921386719, |
| "learning_rate": 1.5909338582983825e-06, |
| "loss": 1.1946, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.2742175856929956, |
| "grad_norm": 22.412290573120117, |
| "learning_rate": 1.5684648767198412e-06, |
| "loss": 1.1862, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.2801788375558867, |
| "grad_norm": 13.300796508789062, |
| "learning_rate": 1.5460829371442626e-06, |
| "loss": 1.2043, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.286140089418778, |
| "grad_norm": 30.526369094848633, |
| "learning_rate": 1.5237901309247282e-06, |
| "loss": 1.1886, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.292101341281669, |
| "grad_norm": 28.1364803314209, |
| "learning_rate": 1.5015885410857617e-06, |
| "loss": 1.1393, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.2980625931445604, |
| "grad_norm": 16.774173736572266, |
| "learning_rate": 1.4794802421286881e-06, |
| "loss": 1.0952, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.3040238450074515, |
| "grad_norm": 11.247906684875488, |
| "learning_rate": 1.457467299837797e-06, |
| "loss": 1.118, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.3099850968703428, |
| "grad_norm": 27.284690856933594, |
| "learning_rate": 1.4355517710873184e-06, |
| "loss": 1.2864, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.315946348733234, |
| "grad_norm": 16.30120849609375, |
| "learning_rate": 1.4137357036492255e-06, |
| "loss": 1.1174, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.3219076005961252, |
| "grad_norm": 21.61345672607422, |
| "learning_rate": 1.3920211360018971e-06, |
| "loss": 1.2607, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.3278688524590163, |
| "grad_norm": 82.09076690673828, |
| "learning_rate": 1.3704100971396378e-06, |
| "loss": 1.0947, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.3338301043219076, |
| "grad_norm": 28.745494842529297, |
| "learning_rate": 1.3489046063830974e-06, |
| "loss": 1.1263, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.339791356184799, |
| "grad_norm": 20.771093368530273, |
| "learning_rate": 1.327506673190579e-06, |
| "loss": 1.163, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.34575260804769, |
| "grad_norm": 16.300779342651367, |
| "learning_rate": 1.306218296970284e-06, |
| "loss": 1.15, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.3517138599105811, |
| "grad_norm": 16.083730697631836, |
| "learning_rate": 1.285041466893485e-06, |
| "loss": 1.2341, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.3576751117734724, |
| "grad_norm": 70.427978515625, |
| "learning_rate": 1.2639781617086589e-06, |
| "loss": 1.0874, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.3636363636363638, |
| "grad_norm": 16.239727020263672, |
| "learning_rate": 1.2430303495565928e-06, |
| "loss": 1.1787, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.3695976154992549, |
| "grad_norm": 20.107654571533203, |
| "learning_rate": 1.222199987786487e-06, |
| "loss": 1.0678, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.375558867362146, |
| "grad_norm": 18.42952537536621, |
| "learning_rate": 1.201489022773057e-06, |
| "loss": 1.1223, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.3815201192250373, |
| "grad_norm": 20.96135711669922, |
| "learning_rate": 1.1808993897346679e-06, |
| "loss": 1.1385, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.3874813710879286, |
| "grad_norm": 35.41992950439453, |
| "learning_rate": 1.160433012552508e-06, |
| "loss": 1.1785, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.3934426229508197, |
| "grad_norm": 15.66014575958252, |
| "learning_rate": 1.1400918035908238e-06, |
| "loss": 1.1337, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.3994038748137108, |
| "grad_norm": 11.301997184753418, |
| "learning_rate": 1.1198776635182273e-06, |
| "loss": 1.1967, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.405365126676602, |
| "grad_norm": 20.638965606689453, |
| "learning_rate": 1.0997924811301008e-06, |
| "loss": 1.2153, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.4113263785394934, |
| "grad_norm": 28.17490005493164, |
| "learning_rate": 1.079838133172111e-06, |
| "loss": 1.1678, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.4172876304023845, |
| "grad_norm": 23.852008819580078, |
| "learning_rate": 1.0600164841648435e-06, |
| "loss": 1.1183, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.4232488822652756, |
| "grad_norm": 15.352867126464844, |
| "learning_rate": 1.0403293862295863e-06, |
| "loss": 1.2151, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.4292101341281669, |
| "grad_norm": 26.934675216674805, |
| "learning_rate": 1.0207786789152672e-06, |
| "loss": 1.1222, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.4351713859910582, |
| "grad_norm": 10.703940391540527, |
| "learning_rate": 1.0013661890265656e-06, |
| "loss": 1.0719, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.4411326378539493, |
| "grad_norm": 12.302316665649414, |
| "learning_rate": 9.820937304532221e-07, |
| "loss": 1.2103, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.4470938897168406, |
| "grad_norm": 16.27391815185547, |
| "learning_rate": 9.629631040005469e-07, |
| "loss": 1.1064, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.4530551415797317, |
| "grad_norm": 16.081787109375, |
| "learning_rate": 9.439760972211545e-07, |
| "loss": 1.1514, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.459016393442623, |
| "grad_norm": 16.725788116455078, |
| "learning_rate": 9.251344842479332e-07, |
| "loss": 1.1508, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.464977645305514, |
| "grad_norm": 43.19313049316406, |
| "learning_rate": 9.064400256282757e-07, |
| "loss": 1.205, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.4709388971684054, |
| "grad_norm": 44.95701599121094, |
| "learning_rate": 8.878944681595742e-07, |
| "loss": 1.1407, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.4769001490312965, |
| "grad_norm": 16.098857879638672, |
| "learning_rate": 8.694995447259955e-07, |
| "loss": 1.1705, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.4828614008941878, |
| "grad_norm": 21.242740631103516, |
| "learning_rate": 8.512569741365692e-07, |
| "loss": 1.1271, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.488822652757079, |
| "grad_norm": 44.7149772644043, |
| "learning_rate": 8.331684609645779e-07, |
| "loss": 1.1659, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.4947839046199702, |
| "grad_norm": 27.816669464111328, |
| "learning_rate": 8.152356953882857e-07, |
| "loss": 1.1576, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.5007451564828616, |
| "grad_norm": 28.994232177734375, |
| "learning_rate": 7.974603530330069e-07, |
| "loss": 1.104, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.5067064083457526, |
| "grad_norm": 277.7655334472656, |
| "learning_rate": 7.7984409481454e-07, |
| "loss": 1.1457, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.5126676602086437, |
| "grad_norm": 21.334651947021484, |
| "learning_rate": 7.623885667839686e-07, |
| "loss": 1.0349, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.518628912071535, |
| "grad_norm": 29.977994918823242, |
| "learning_rate": 7.450953999738584e-07, |
| "loss": 1.1867, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.5245901639344264, |
| "grad_norm": 13.20008659362793, |
| "learning_rate": 7.279662102458551e-07, |
| "loss": 1.1297, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.5305514157973175, |
| "grad_norm": 22.25104522705078, |
| "learning_rate": 7.110025981396976e-07, |
| "loss": 1.1136, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.5365126676602086, |
| "grad_norm": 75.48139190673828, |
| "learning_rate": 6.942061487236654e-07, |
| "loss": 1.1347, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.5424739195230999, |
| "grad_norm": 13.588129997253418, |
| "learning_rate": 6.775784314464717e-07, |
| "loss": 1.1234, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.5484351713859912, |
| "grad_norm": 17.303014755249023, |
| "learning_rate": 6.611209999906124e-07, |
| "loss": 1.1218, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.5543964232488823, |
| "grad_norm": 13.967643737792969, |
| "learning_rate": 6.448353921271949e-07, |
| "loss": 1.297, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.5603576751117734, |
| "grad_norm": 14.436738014221191, |
| "learning_rate": 6.28723129572247e-07, |
| "loss": 1.1563, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.5663189269746647, |
| "grad_norm": 18.954809188842773, |
| "learning_rate": 6.12785717844531e-07, |
| "loss": 1.1938, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.572280178837556, |
| "grad_norm": 13.521712303161621, |
| "learning_rate": 5.970246461248668e-07, |
| "loss": 1.1681, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.578241430700447, |
| "grad_norm": 22.596717834472656, |
| "learning_rate": 5.814413871169844e-07, |
| "loss": 1.1362, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.5842026825633382, |
| "grad_norm": 23.537677764892578, |
| "learning_rate": 5.660373969099178e-07, |
| "loss": 1.249, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.5901639344262295, |
| "grad_norm": 41.0632209777832, |
| "learning_rate": 5.508141148419443e-07, |
| "loss": 1.2773, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.5961251862891208, |
| "grad_norm": 11.993062973022461, |
| "learning_rate": 5.357729633660999e-07, |
| "loss": 1.0553, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.602086438152012, |
| "grad_norm": 16.83928871154785, |
| "learning_rate": 5.209153479172607e-07, |
| "loss": 1.19, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.608047690014903, |
| "grad_norm": 13.438224792480469, |
| "learning_rate": 5.062426567808237e-07, |
| "loss": 1.1166, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.6140089418777943, |
| "grad_norm": 12.740294456481934, |
| "learning_rate": 4.917562609629847e-07, |
| "loss": 1.0806, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.6199701937406856, |
| "grad_norm": 40.68648910522461, |
| "learning_rate": 4.774575140626317e-07, |
| "loss": 1.278, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.6259314456035767, |
| "grad_norm": 45.1893310546875, |
| "learning_rate": 4.6334775214486786e-07, |
| "loss": 1.1741, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.6318926974664678, |
| "grad_norm": 11.675634384155273, |
| "learning_rate": 4.494282936161681e-07, |
| "loss": 1.1918, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.6378539493293591, |
| "grad_norm": 18.91298484802246, |
| "learning_rate": 4.3570043910118986e-07, |
| "loss": 1.2231, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.6438152011922504, |
| "grad_norm": 14.937652587890625, |
| "learning_rate": 4.221654713212431e-07, |
| "loss": 1.1785, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.6497764530551415, |
| "grad_norm": 12.2354097366333, |
| "learning_rate": 4.088246549744332e-07, |
| "loss": 1.0918, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.6557377049180326, |
| "grad_norm": 17.386564254760742, |
| "learning_rate": 3.956792366174894e-07, |
| "loss": 1.1152, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.661698956780924, |
| "grad_norm": 13.0751314163208, |
| "learning_rate": 3.8273044454928547e-07, |
| "loss": 1.1894, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.6676602086438153, |
| "grad_norm": 40.181907653808594, |
| "learning_rate": 3.699794886960706e-07, |
| "loss": 1.206, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.6736214605067063, |
| "grad_norm": 16.61675453186035, |
| "learning_rate": 3.5742756049841397e-07, |
| "loss": 1.1038, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.6795827123695977, |
| "grad_norm": 20.62757682800293, |
| "learning_rate": 3.450758327998768e-07, |
| "loss": 1.12, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.685543964232489, |
| "grad_norm": 11.517589569091797, |
| "learning_rate": 3.329254597374232e-07, |
| "loss": 1.1531, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.69150521609538, |
| "grad_norm": 15.695626258850098, |
| "learning_rate": 3.209775766335771e-07, |
| "loss": 1.1176, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.6974664679582712, |
| "grad_norm": 13.518576622009277, |
| "learning_rate": 3.092332998903416e-07, |
| "loss": 1.1052, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.7034277198211625, |
| "grad_norm": 26.3131160736084, |
| "learning_rate": 2.976937268848787e-07, |
| "loss": 1.1914, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.7093889716840538, |
| "grad_norm": 8.992045402526855, |
| "learning_rate": 2.8635993586697555e-07, |
| "loss": 1.1349, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.7153502235469449, |
| "grad_norm": 15.58350944519043, |
| "learning_rate": 2.752329858582906e-07, |
| "loss": 1.1917, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.721311475409836, |
| "grad_norm": 19.328996658325195, |
| "learning_rate": 2.643139165534009e-07, |
| "loss": 1.1012, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.7272727272727273, |
| "grad_norm": 53.852378845214844, |
| "learning_rate": 2.5360374822265276e-07, |
| "loss": 1.1354, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.7332339791356186, |
| "grad_norm": 21.8743896484375, |
| "learning_rate": 2.431034816168279e-07, |
| "loss": 1.2384, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.7391952309985097, |
| "grad_norm": 17.44244956970215, |
| "learning_rate": 2.3281409787363652e-07, |
| "loss": 1.1438, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.7451564828614008, |
| "grad_norm": 11.067893981933594, |
| "learning_rate": 2.227365584260377e-07, |
| "loss": 1.0889, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.751117734724292, |
| "grad_norm": 28.883140563964844, |
| "learning_rate": 2.1287180491240455e-07, |
| "loss": 1.0806, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.7570789865871834, |
| "grad_norm": 15.901183128356934, |
| "learning_rate": 2.0322075908853934e-07, |
| "loss": 1.1334, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.7630402384500745, |
| "grad_norm": 36.24775695800781, |
| "learning_rate": 1.9378432274154424e-07, |
| "loss": 1.2407, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.7690014903129656, |
| "grad_norm": 20.610580444335938, |
| "learning_rate": 1.8456337760555915e-07, |
| "loss": 1.2004, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.774962742175857, |
| "grad_norm": 25.599275588989258, |
| "learning_rate": 1.7555878527937164e-07, |
| "loss": 1.2447, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.7809239940387482, |
| "grad_norm": 27.45682716369629, |
| "learning_rate": 1.6677138714591313e-07, |
| "loss": 1.1359, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.7868852459016393, |
| "grad_norm": 9.919366836547852, |
| "learning_rate": 1.5820200429363775e-07, |
| "loss": 1.1541, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.7928464977645304, |
| "grad_norm": 25.252059936523438, |
| "learning_rate": 1.498514374398008e-07, |
| "loss": 1.1065, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.7988077496274217, |
| "grad_norm": 10.612046241760254, |
| "learning_rate": 1.417204668556421e-07, |
| "loss": 1.0322, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.804769001490313, |
| "grad_norm": 29.110383987426758, |
| "learning_rate": 1.3380985229347555e-07, |
| "loss": 1.0643, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.8107302533532041, |
| "grad_norm": 28.44003677368164, |
| "learning_rate": 1.2612033291569985e-07, |
| "loss": 1.197, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.8166915052160952, |
| "grad_norm": 16.720949172973633, |
| "learning_rate": 1.1865262722573073e-07, |
| "loss": 1.1588, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.8226527570789866, |
| "grad_norm": 24.2500057220459, |
| "learning_rate": 1.1140743300086603e-07, |
| "loss": 1.1925, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.8286140089418779, |
| "grad_norm": 29.950042724609375, |
| "learning_rate": 1.0438542722708444e-07, |
| "loss": 1.2223, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.834575260804769, |
| "grad_norm": 13.418863296508789, |
| "learning_rate": 9.758726603578932e-08, |
| "loss": 1.0422, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.84053651266766, |
| "grad_norm": 31.109628677368164, |
| "learning_rate": 9.101358464249921e-08, |
| "loss": 1.1159, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.8464977645305514, |
| "grad_norm": 13.534027099609375, |
| "learning_rate": 8.466499728749411e-08, |
| "loss": 1.2084, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.8524590163934427, |
| "grad_norm": 17.113624572753906, |
| "learning_rate": 7.854209717842231e-08, |
| "loss": 1.125, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.8584202682563338, |
| "grad_norm": 11.193347930908203, |
| "learning_rate": 7.264545643486997e-08, |
| "loss": 1.03, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.864381520119225, |
| "grad_norm": 90.95343017578125, |
| "learning_rate": 6.697562603490387e-08, |
| "loss": 1.1461, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.8703427719821164, |
| "grad_norm": 9.864035606384277, |
| "learning_rate": 6.153313576358705e-08, |
| "loss": 1.1409, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.8763040238450075, |
| "grad_norm": 9.691558837890625, |
| "learning_rate": 5.6318494163477564e-08, |
| "loss": 1.2217, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.8822652757078986, |
| "grad_norm": 14.702143669128418, |
| "learning_rate": 5.133218848711013e-08, |
| "loss": 1.0934, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.88822652757079, |
| "grad_norm": 17.254133224487305, |
| "learning_rate": 4.657468465146642e-08, |
| "loss": 1.1534, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.8941877794336812, |
| "grad_norm": 60.02861022949219, |
| "learning_rate": 4.20464271944418e-08, |
| "loss": 1.2898, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.9001490312965723, |
| "grad_norm": 16.171571731567383, |
| "learning_rate": 3.774783923330694e-08, |
| "loss": 1.0887, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.9061102831594634, |
| "grad_norm": 11.587321281433105, |
| "learning_rate": 3.3679322425172466e-08, |
| "loss": 1.0143, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.9120715350223547, |
| "grad_norm": 28.936967849731445, |
| "learning_rate": 2.984125692945872e-08, |
| "loss": 1.2466, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.918032786885246, |
| "grad_norm": 11.389568328857422, |
| "learning_rate": 2.6234001372372196e-08, |
| "loss": 1.1364, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.9239940387481371, |
| "grad_norm": 11.347761154174805, |
| "learning_rate": 2.2857892813398785e-08, |
| "loss": 1.0225, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.9299552906110282, |
| "grad_norm": 17.69095802307129, |
| "learning_rate": 1.9713246713805588e-08, |
| "loss": 1.0806, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.9359165424739195, |
| "grad_norm": 18.81931495666504, |
| "learning_rate": 1.680035690716758e-08, |
| "loss": 1.1148, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.9418777943368108, |
| "grad_norm": 14.300539016723633, |
| "learning_rate": 1.411949557191039e-08, |
| "loss": 1.1152, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.947839046199702, |
| "grad_norm": 14.622347831726074, |
| "learning_rate": 1.1670913205878431e-08, |
| "loss": 1.0456, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.953800298062593, |
| "grad_norm": 15.9490327835083, |
| "learning_rate": 9.454838602928341e-09, |
| "loss": 1.0858, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.9597615499254843, |
| "grad_norm": 17.17402458190918, |
| "learning_rate": 7.471478831550804e-09, |
| "loss": 1.0958, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.9657228017883757, |
| "grad_norm": 15.29993724822998, |
| "learning_rate": 5.721019215522428e-09, |
| "loss": 1.0968, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.9716840536512668, |
| "grad_norm": 20.376798629760742, |
| "learning_rate": 4.2036233165893006e-09, |
| "loss": 1.1139, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.9776453055141578, |
| "grad_norm": 38.81945037841797, |
| "learning_rate": 2.919432919183396e-09, |
| "loss": 1.1274, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.9836065573770492, |
| "grad_norm": 54.10487747192383, |
| "learning_rate": 1.8685680171745547e-09, |
| "loss": 1.1034, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.9895678092399405, |
| "grad_norm": 18.45663070678711, |
| "learning_rate": 1.051126802658342e-09, |
| "loss": 1.0882, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.9955290611028316, |
| "grad_norm": 22.40350341796875, |
| "learning_rate": 4.671856567811661e-10, |
| "loss": 1.2242, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 11.405494689941406, |
| "learning_rate": 1.167991426032078e-10, |
| "loss": 0.8141, |
| "step": 336 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 336, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.533791144373125e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|