{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 720,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013888888888888888,
      "grad_norm": 1.3303340673446655,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 1.2758,
      "step": 5
    },
    {
      "epoch": 0.027777777777777776,
      "grad_norm": 0.8942481279373169,
      "learning_rate": 3e-06,
      "loss": 1.261,
      "step": 10
    },
    {
      "epoch": 0.041666666666666664,
      "grad_norm": 0.69441157579422,
      "learning_rate": 4.666666666666667e-06,
      "loss": 1.2394,
      "step": 15
    },
    {
      "epoch": 0.05555555555555555,
      "grad_norm": 0.5783836841583252,
      "learning_rate": 6.333333333333333e-06,
      "loss": 1.28,
      "step": 20
    },
    {
      "epoch": 0.06944444444444445,
      "grad_norm": 0.6185364127159119,
      "learning_rate": 8e-06,
      "loss": 1.2652,
      "step": 25
    },
    {
      "epoch": 0.08333333333333333,
      "grad_norm": 0.5661951899528503,
      "learning_rate": 9.666666666666667e-06,
      "loss": 1.2474,
      "step": 30
    },
    {
      "epoch": 0.09722222222222222,
      "grad_norm": 0.5511808395385742,
      "learning_rate": 1.1333333333333334e-05,
      "loss": 1.204,
      "step": 35
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 0.46661698818206787,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 1.2092,
      "step": 40
    },
    {
      "epoch": 0.125,
      "grad_norm": 0.4330252707004547,
      "learning_rate": 1.4666666666666666e-05,
      "loss": 1.1796,
      "step": 45
    },
    {
      "epoch": 0.1388888888888889,
      "grad_norm": 0.4823797941207886,
      "learning_rate": 1.633333333333333e-05,
      "loss": 1.2358,
      "step": 50
    },
    {
      "epoch": 0.1527777777777778,
      "grad_norm": 0.45977669954299927,
      "learning_rate": 1.8e-05,
      "loss": 1.179,
      "step": 55
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 0.5074872374534607,
      "learning_rate": 1.9666666666666666e-05,
      "loss": 1.1703,
      "step": 60
    },
    {
      "epoch": 0.18055555555555555,
      "grad_norm": 0.47606608271598816,
      "learning_rate": 2.1333333333333335e-05,
      "loss": 1.1341,
      "step": 65
    },
    {
      "epoch": 0.19444444444444445,
      "grad_norm": 0.5941349864006042,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 1.1718,
      "step": 70
    },
    {
      "epoch": 0.20833333333333334,
      "grad_norm": 0.5346580743789673,
      "learning_rate": 2.4666666666666665e-05,
      "loss": 1.151,
      "step": 75
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 0.49426817893981934,
      "learning_rate": 2.6333333333333334e-05,
      "loss": 1.0661,
      "step": 80
    },
    {
      "epoch": 0.2361111111111111,
      "grad_norm": 0.425722599029541,
      "learning_rate": 2.8e-05,
      "loss": 1.044,
      "step": 85
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.5695877075195312,
      "learning_rate": 2.966666666666667e-05,
      "loss": 1.1016,
      "step": 90
    },
    {
      "epoch": 0.2638888888888889,
      "grad_norm": 0.5956630706787109,
      "learning_rate": 2.999959497035049e-05,
      "loss": 1.0176,
      "step": 95
    },
    {
      "epoch": 0.2777777777777778,
      "grad_norm": 0.5605266690254211,
      "learning_rate": 2.9997949574887035e-05,
      "loss": 1.0613,
      "step": 100
    },
    {
      "epoch": 0.2916666666666667,
      "grad_norm": 0.6029469966888428,
      "learning_rate": 2.99950386379898e-05,
      "loss": 1.0369,
      "step": 105
    },
    {
      "epoch": 0.3055555555555556,
      "grad_norm": 0.5131080150604248,
      "learning_rate": 2.9990862405286438e-05,
      "loss": 1.0467,
      "step": 110
    },
    {
      "epoch": 0.3194444444444444,
      "grad_norm": 0.6137083172798157,
      "learning_rate": 2.998542122917149e-05,
      "loss": 1.0145,
      "step": 115
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.5534991025924683,
      "learning_rate": 2.997871556877664e-05,
      "loss": 1.0217,
      "step": 120
    },
    {
      "epoch": 0.3472222222222222,
      "grad_norm": 0.7161787152290344,
      "learning_rate": 2.997074598993197e-05,
      "loss": 1.0402,
      "step": 125
    },
    {
      "epoch": 0.3611111111111111,
      "grad_norm": 0.594761312007904,
      "learning_rate": 2.9961513165118224e-05,
      "loss": 1.0464,
      "step": 130
    },
    {
      "epoch": 0.375,
      "grad_norm": 0.6094787120819092,
      "learning_rate": 2.9951017873410053e-05,
      "loss": 0.9971,
      "step": 135
    },
    {
      "epoch": 0.3888888888888889,
      "grad_norm": 0.5699455142021179,
      "learning_rate": 2.993926100041028e-05,
      "loss": 0.9726,
      "step": 140
    },
    {
      "epoch": 0.4027777777777778,
      "grad_norm": 0.726179301738739,
      "learning_rate": 2.9926243538175172e-05,
      "loss": 0.9568,
      "step": 145
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 0.7146620154380798,
      "learning_rate": 2.9911966585130737e-05,
      "loss": 0.9367,
      "step": 150
    },
    {
      "epoch": 0.4305555555555556,
      "grad_norm": 0.7823082804679871,
      "learning_rate": 2.9896431345980032e-05,
      "loss": 0.9897,
      "step": 155
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.6647103428840637,
      "learning_rate": 2.9879639131601504e-05,
      "loss": 0.9398,
      "step": 160
    },
    {
      "epoch": 0.4583333333333333,
      "grad_norm": 0.6053484678268433,
      "learning_rate": 2.9861591358938368e-05,
      "loss": 0.958,
      "step": 165
    },
    {
      "epoch": 0.4722222222222222,
      "grad_norm": 0.629128634929657,
      "learning_rate": 2.9842289550879072e-05,
      "loss": 0.9099,
      "step": 170
    },
    {
      "epoch": 0.4861111111111111,
      "grad_norm": 0.7081131935119629,
      "learning_rate": 2.9821735336128774e-05,
      "loss": 0.8751,
      "step": 175
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.7313970923423767,
      "learning_rate": 2.9799930449071926e-05,
      "loss": 0.8703,
      "step": 180
    },
    {
      "epoch": 0.5138888888888888,
      "grad_norm": 0.8824923038482666,
      "learning_rate": 2.9776876729625894e-05,
      "loss": 0.9141,
      "step": 185
    },
    {
      "epoch": 0.5277777777777778,
      "grad_norm": 0.6370046734809875,
      "learning_rate": 2.9752576123085737e-05,
      "loss": 0.8878,
      "step": 190
    },
    {
      "epoch": 0.5416666666666666,
      "grad_norm": 0.7419236898422241,
      "learning_rate": 2.9727030679960043e-05,
      "loss": 0.8464,
      "step": 195
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 0.8200927376747131,
      "learning_rate": 2.970024255579792e-05,
      "loss": 0.8303,
      "step": 200
    },
    {
      "epoch": 0.5694444444444444,
      "grad_norm": 0.7647119760513306,
      "learning_rate": 2.9672214011007087e-05,
      "loss": 0.9101,
      "step": 205
    },
    {
      "epoch": 0.5833333333333334,
      "grad_norm": 0.8843978643417358,
      "learning_rate": 2.9642947410663154e-05,
      "loss": 0.8749,
      "step": 210
    },
    {
      "epoch": 0.5972222222222222,
      "grad_norm": 0.8577505350112915,
      "learning_rate": 2.9612445224310053e-05,
      "loss": 0.8437,
      "step": 215
    },
    {
      "epoch": 0.6111111111111112,
      "grad_norm": 0.8507093191146851,
      "learning_rate": 2.958071002575166e-05,
      "loss": 0.8313,
      "step": 220
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.861611545085907,
      "learning_rate": 2.9547744492834593e-05,
      "loss": 0.8744,
      "step": 225
    },
    {
      "epoch": 0.6388888888888888,
      "grad_norm": 0.7713214159011841,
      "learning_rate": 2.951355140722228e-05,
      "loss": 0.8769,
      "step": 230
    },
    {
      "epoch": 0.6527777777777778,
      "grad_norm": 0.9293366074562073,
      "learning_rate": 2.947813365416023e-05,
      "loss": 0.8354,
      "step": 235
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.8537072539329529,
      "learning_rate": 2.9441494222232568e-05,
      "loss": 0.7734,
      "step": 240
    },
    {
      "epoch": 0.6805555555555556,
      "grad_norm": 0.8877134323120117,
      "learning_rate": 2.940363620310985e-05,
      "loss": 0.8141,
      "step": 245
    },
    {
      "epoch": 0.6944444444444444,
      "grad_norm": 0.9355044960975647,
      "learning_rate": 2.9364562791288207e-05,
      "loss": 0.7648,
      "step": 250
    },
    {
      "epoch": 0.7083333333333334,
      "grad_norm": 0.8050118088722229,
      "learning_rate": 2.9324277283819768e-05,
      "loss": 0.8067,
      "step": 255
    },
    {
      "epoch": 0.7222222222222222,
      "grad_norm": 0.7927704453468323,
      "learning_rate": 2.9282783080034466e-05,
      "loss": 0.7921,
      "step": 260
    },
    {
      "epoch": 0.7361111111111112,
      "grad_norm": 0.7963441610336304,
      "learning_rate": 2.9240083681253192e-05,
      "loss": 0.8044,
      "step": 265
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.9507607817649841,
      "learning_rate": 2.9196182690492342e-05,
      "loss": 0.758,
      "step": 270
    },
    {
      "epoch": 0.7638888888888888,
      "grad_norm": 0.8225581049919128,
      "learning_rate": 2.915108381215982e-05,
      "loss": 0.7465,
      "step": 275
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 0.9361241459846497,
      "learning_rate": 2.910479085174242e-05,
      "loss": 0.8049,
      "step": 280
    },
    {
      "epoch": 0.7916666666666666,
      "grad_norm": 0.9086710214614868,
      "learning_rate": 2.9057307715484746e-05,
      "loss": 0.7478,
      "step": 285
    },
    {
      "epoch": 0.8055555555555556,
      "grad_norm": 0.9152787327766418,
      "learning_rate": 2.900863841005957e-05,
      "loss": 0.7627,
      "step": 290
    },
    {
      "epoch": 0.8194444444444444,
      "grad_norm": 0.9706355333328247,
      "learning_rate": 2.895878704222978e-05,
      "loss": 0.7092,
      "step": 295
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 0.8699019551277161,
      "learning_rate": 2.890775781850181e-05,
      "loss": 0.7574,
      "step": 300
    },
    {
      "epoch": 0.8472222222222222,
      "grad_norm": 0.9554646015167236,
      "learning_rate": 2.8855555044770728e-05,
      "loss": 0.7224,
      "step": 305
    },
    {
      "epoch": 0.8611111111111112,
      "grad_norm": 0.9026999473571777,
      "learning_rate": 2.880218312595687e-05,
      "loss": 0.6757,
      "step": 310
    },
    {
      "epoch": 0.875,
      "grad_norm": 0.8771184086799622,
      "learning_rate": 2.8747646565634157e-05,
      "loss": 0.7325,
      "step": 315
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 1.1735718250274658,
      "learning_rate": 2.8691949965650092e-05,
      "loss": 0.6886,
      "step": 320
    },
    {
      "epoch": 0.9027777777777778,
      "grad_norm": 0.8841818571090698,
      "learning_rate": 2.863509802573744e-05,
      "loss": 0.6573,
      "step": 325
    },
    {
      "epoch": 0.9166666666666666,
      "grad_norm": 0.9687197804450989,
      "learning_rate": 2.8577095543117655e-05,
      "loss": 0.6765,
      "step": 330
    },
    {
      "epoch": 0.9305555555555556,
      "grad_norm": 0.9504878520965576,
      "learning_rate": 2.8517947412096107e-05,
      "loss": 0.6933,
      "step": 335
    },
    {
      "epoch": 0.9444444444444444,
      "grad_norm": 1.0276504755020142,
      "learning_rate": 2.8457658623649063e-05,
      "loss": 0.6655,
      "step": 340
    },
    {
      "epoch": 0.9583333333333334,
      "grad_norm": 0.9590686559677124,
      "learning_rate": 2.8396234265002583e-05,
      "loss": 0.6441,
      "step": 345
    },
    {
      "epoch": 0.9722222222222222,
      "grad_norm": 0.9926571249961853,
      "learning_rate": 2.8333679519203223e-05,
      "loss": 0.6189,
      "step": 350
    },
    {
      "epoch": 0.9861111111111112,
      "grad_norm": 1.0292668342590332,
      "learning_rate": 2.826999966468069e-05,
      "loss": 0.6611,
      "step": 355
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.0787709951400757,
      "learning_rate": 2.820520007480246e-05,
      "loss": 0.7084,
      "step": 360
    },
    {
      "epoch": 1.0138888888888888,
      "grad_norm": 0.9844677448272705,
      "learning_rate": 2.8139286217420356e-05,
      "loss": 0.5665,
      "step": 365
    },
    {
      "epoch": 1.0277777777777777,
      "grad_norm": 0.9413930773735046,
      "learning_rate": 2.807226365440916e-05,
      "loss": 0.5811,
      "step": 370
    },
    {
      "epoch": 1.0416666666666667,
      "grad_norm": 1.0975089073181152,
      "learning_rate": 2.8004138041197307e-05,
      "loss": 0.5766,
      "step": 375
    },
    {
      "epoch": 1.0555555555555556,
      "grad_norm": 1.1106640100479126,
      "learning_rate": 2.7934915126289675e-05,
      "loss": 0.5595,
      "step": 380
    },
    {
      "epoch": 1.0694444444444444,
      "grad_norm": 0.9964764714241028,
      "learning_rate": 2.7864600750782507e-05,
      "loss": 0.5701,
      "step": 385
    },
    {
      "epoch": 1.0833333333333333,
      "grad_norm": 1.0286003351211548,
      "learning_rate": 2.7793200847870544e-05,
      "loss": 0.5003,
      "step": 390
    },
    {
      "epoch": 1.0972222222222223,
      "grad_norm": 1.0146455764770508,
      "learning_rate": 2.772072144234639e-05,
      "loss": 0.5854,
      "step": 395
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 1.1295380592346191,
      "learning_rate": 2.76471686500921e-05,
      "loss": 0.6124,
      "step": 400
    },
    {
      "epoch": 1.125,
      "grad_norm": 1.013516902923584,
      "learning_rate": 2.757254867756314e-05,
      "loss": 0.5504,
      "step": 405
    },
    {
      "epoch": 1.1388888888888888,
      "grad_norm": 1.05109441280365,
      "learning_rate": 2.7496867821264688e-05,
      "loss": 0.5176,
      "step": 410
    },
    {
      "epoch": 1.1527777777777777,
      "grad_norm": 0.8996031880378723,
      "learning_rate": 2.74201324672203e-05,
      "loss": 0.5452,
      "step": 415
    },
    {
      "epoch": 1.1666666666666667,
      "grad_norm": 0.9900648593902588,
      "learning_rate": 2.734234909043308e-05,
      "loss": 0.5308,
      "step": 420
    },
    {
      "epoch": 1.1805555555555556,
      "grad_norm": 1.0687711238861084,
      "learning_rate": 2.7263524254339294e-05,
      "loss": 0.5086,
      "step": 425
    },
    {
      "epoch": 1.1944444444444444,
      "grad_norm": 1.0682910680770874,
      "learning_rate": 2.718366461025455e-05,
      "loss": 0.4955,
      "step": 430
    },
    {
      "epoch": 1.2083333333333333,
      "grad_norm": 1.0465431213378906,
      "learning_rate": 2.7102776896812554e-05,
      "loss": 0.5502,
      "step": 435
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 1.0831612348556519,
      "learning_rate": 2.7020867939396483e-05,
      "loss": 0.5047,
      "step": 440
    },
    {
      "epoch": 1.2361111111111112,
      "grad_norm": 0.9753193855285645,
      "learning_rate": 2.6937944649563078e-05,
      "loss": 0.5268,
      "step": 445
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.2069863080978394,
      "learning_rate": 2.685401402445942e-05,
      "loss": 0.4902,
      "step": 450
    },
    {
      "epoch": 1.2638888888888888,
      "grad_norm": 1.0545098781585693,
      "learning_rate": 2.6769083146232514e-05,
      "loss": 0.4788,
      "step": 455
    },
    {
      "epoch": 1.2777777777777777,
      "grad_norm": 1.0795210599899292,
      "learning_rate": 2.668315918143169e-05,
      "loss": 0.5468,
      "step": 460
    },
    {
      "epoch": 1.2916666666666667,
      "grad_norm": 1.2133684158325195,
      "learning_rate": 2.6596249380403864e-05,
      "loss": 0.48,
      "step": 465
    },
    {
      "epoch": 1.3055555555555556,
      "grad_norm": 1.0929421186447144,
      "learning_rate": 2.6508361076681776e-05,
      "loss": 0.4918,
      "step": 470
    },
    {
      "epoch": 1.3194444444444444,
      "grad_norm": 0.9951267838478088,
      "learning_rate": 2.641950168636517e-05,
      "loss": 0.4974,
      "step": 475
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 1.131229043006897,
      "learning_rate": 2.6329678707495002e-05,
      "loss": 0.4925,
      "step": 480
    },
    {
      "epoch": 1.3472222222222223,
      "grad_norm": 1.0469568967819214,
      "learning_rate": 2.623889971942076e-05,
      "loss": 0.4916,
      "step": 485
    },
    {
      "epoch": 1.3611111111111112,
      "grad_norm": 1.0473977327346802,
      "learning_rate": 2.6147172382160913e-05,
      "loss": 0.4861,
      "step": 490
    },
    {
      "epoch": 1.375,
      "grad_norm": 1.0654410123825073,
      "learning_rate": 2.605450443575655e-05,
      "loss": 0.5142,
      "step": 495
    },
    {
      "epoch": 1.3888888888888888,
      "grad_norm": 1.1624093055725098,
      "learning_rate": 2.5960903699618263e-05,
      "loss": 0.477,
      "step": 500
    },
    {
      "epoch": 1.4027777777777777,
      "grad_norm": 1.1483854055404663,
      "learning_rate": 2.5866378071866338e-05,
      "loss": 0.5209,
      "step": 505
    },
    {
      "epoch": 1.4166666666666667,
      "grad_norm": 0.9577386975288391,
      "learning_rate": 2.5770935528664304e-05,
      "loss": 0.4941,
      "step": 510
    },
    {
      "epoch": 1.4305555555555556,
      "grad_norm": 1.0291787385940552,
      "learning_rate": 2.5674584123545908e-05,
      "loss": 0.4468,
      "step": 515
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 1.153069019317627,
      "learning_rate": 2.5577331986735536e-05,
      "loss": 0.4491,
      "step": 520
    },
    {
      "epoch": 1.4583333333333333,
      "grad_norm": 1.2133917808532715,
      "learning_rate": 2.5479187324462184e-05,
      "loss": 0.4373,
      "step": 525
    },
    {
      "epoch": 1.4722222222222223,
      "grad_norm": 1.123104453086853,
      "learning_rate": 2.5380158418267e-05,
      "loss": 0.4666,
      "step": 530
    },
    {
      "epoch": 1.4861111111111112,
      "grad_norm": 0.8907086849212646,
      "learning_rate": 2.52802536243045e-05,
      "loss": 0.4511,
      "step": 535
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.0025827884674072,
      "learning_rate": 2.517948137263743e-05,
      "loss": 0.4249,
      "step": 540
    },
    {
      "epoch": 1.5138888888888888,
      "grad_norm": 1.0645135641098022,
      "learning_rate": 2.5077850166525484e-05,
      "loss": 0.4483,
      "step": 545
    },
    {
      "epoch": 1.5277777777777777,
      "grad_norm": 1.1788437366485596,
      "learning_rate": 2.4975368581707724e-05,
      "loss": 0.4472,
      "step": 550
    },
    {
      "epoch": 1.5416666666666665,
      "grad_norm": 1.0517628192901611,
      "learning_rate": 2.4872045265679024e-05,
      "loss": 0.4178,
      "step": 555
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 1.0361683368682861,
      "learning_rate": 2.4767888936960312e-05,
      "loss": 0.4179,
      "step": 560
    },
    {
      "epoch": 1.5694444444444444,
      "grad_norm": 1.1737172603607178,
      "learning_rate": 2.4662908384362964e-05,
      "loss": 0.4989,
      "step": 565
    },
    {
      "epoch": 1.5833333333333335,
      "grad_norm": 1.106797695159912,
      "learning_rate": 2.455711246624713e-05,
      "loss": 0.3939,
      "step": 570
    },
    {
      "epoch": 1.5972222222222223,
      "grad_norm": 1.0679595470428467,
      "learning_rate": 2.4450510109774314e-05,
      "loss": 0.4174,
      "step": 575
    },
    {
      "epoch": 1.6111111111111112,
      "grad_norm": 1.3460017442703247,
      "learning_rate": 2.434311031015404e-05,
      "loss": 0.4243,
      "step": 580
    },
    {
      "epoch": 1.625,
      "grad_norm": 1.1296610832214355,
      "learning_rate": 2.4234922129884873e-05,
      "loss": 0.4456,
      "step": 585
    },
    {
      "epoch": 1.6388888888888888,
      "grad_norm": 1.2325292825698853,
      "learning_rate": 2.4125954697989694e-05,
      "loss": 0.4114,
      "step": 590
    },
    {
      "epoch": 1.6527777777777777,
      "grad_norm": 1.159067988395691,
      "learning_rate": 2.4016217209245377e-05,
      "loss": 0.4596,
      "step": 595
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 1.2470641136169434,
      "learning_rate": 2.3905718923406947e-05,
      "loss": 0.3935,
      "step": 600
    },
    {
      "epoch": 1.6805555555555556,
      "grad_norm": 1.0686368942260742,
      "learning_rate": 2.37944691644262e-05,
      "loss": 0.418,
      "step": 605
    },
    {
      "epoch": 1.6944444444444444,
      "grad_norm": 1.0233144760131836,
      "learning_rate": 2.368247731966499e-05,
      "loss": 0.3572,
      "step": 610
    },
    {
      "epoch": 1.7083333333333335,
      "grad_norm": 1.0826553106307983,
      "learning_rate": 2.356975283910306e-05,
      "loss": 0.4114,
      "step": 615
    },
    {
      "epoch": 1.7222222222222223,
      "grad_norm": 1.1858890056610107,
      "learning_rate": 2.3456305234540677e-05,
      "loss": 0.3592,
      "step": 620
    },
    {
      "epoch": 1.7361111111111112,
      "grad_norm": 1.117847204208374,
      "learning_rate": 2.3342144078796007e-05,
      "loss": 0.4112,
      "step": 625
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.130576491355896,
      "learning_rate": 2.3227279004897358e-05,
      "loss": 0.4087,
      "step": 630
    },
    {
      "epoch": 1.7638888888888888,
      "grad_norm": 1.0268139839172363,
      "learning_rate": 2.3111719705270314e-05,
      "loss": 0.4078,
      "step": 635
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 1.1426600217819214,
      "learning_rate": 2.2995475930919907e-05,
      "loss": 0.403,
      "step": 640
    },
    {
      "epoch": 1.7916666666666665,
      "grad_norm": 1.0689858198165894,
      "learning_rate": 2.2878557490607802e-05,
      "loss": 0.3588,
      "step": 645
    },
    {
      "epoch": 1.8055555555555556,
      "grad_norm": 1.174478530883789,
      "learning_rate": 2.276097425002464e-05,
      "loss": 0.3848,
      "step": 650
    },
    {
      "epoch": 1.8194444444444444,
      "grad_norm": 1.0673271417617798,
      "learning_rate": 2.2642736130957522e-05,
      "loss": 0.4206,
      "step": 655
    },
    {
      "epoch": 1.8333333333333335,
      "grad_norm": 1.1517634391784668,
      "learning_rate": 2.2523853110452853e-05,
      "loss": 0.3915,
      "step": 660
    },
    {
      "epoch": 1.8472222222222223,
      "grad_norm": 1.1096245050430298,
      "learning_rate": 2.240433521997443e-05,
      "loss": 0.3316,
      "step": 665
    },
    {
      "epoch": 1.8611111111111112,
      "grad_norm": 1.089471459388733,
      "learning_rate": 2.2284192544556988e-05,
      "loss": 0.3748,
      "step": 670
    },
    {
      "epoch": 1.875,
      "grad_norm": 1.1131129264831543,
      "learning_rate": 2.216343522195523e-05,
      "loss": 0.3758,
      "step": 675
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 1.0454002618789673,
      "learning_rate": 2.2042073441788363e-05,
      "loss": 0.354,
      "step": 680
    },
    {
      "epoch": 1.9027777777777777,
      "grad_norm": 1.0819425582885742,
      "learning_rate": 2.1920117444680317e-05,
      "loss": 0.3419,
      "step": 685
    },
    {
      "epoch": 1.9166666666666665,
      "grad_norm": 1.09736967086792,
      "learning_rate": 2.179757752139562e-05,
      "loss": 0.3783,
      "step": 690
    },
    {
      "epoch": 1.9305555555555556,
      "grad_norm": 1.1822260618209839,
      "learning_rate": 2.1674464011971048e-05,
      "loss": 0.3788,
      "step": 695
    },
    {
      "epoch": 1.9444444444444444,
      "grad_norm": 1.1565293073654175,
      "learning_rate": 2.155078730484313e-05,
      "loss": 0.3549,
      "step": 700
    },
    {
      "epoch": 1.9583333333333335,
      "grad_norm": 1.2186641693115234,
      "learning_rate": 2.1426557835971566e-05,
      "loss": 0.3476,
      "step": 705
    },
    {
      "epoch": 1.9722222222222223,
      "grad_norm": 1.2447794675827026,
      "learning_rate": 2.1301786087958617e-05,
      "loss": 0.3758,
      "step": 710
    },
    {
      "epoch": 1.9861111111111112,
      "grad_norm": 1.1461338996887207,
      "learning_rate": 2.1176482589164575e-05,
      "loss": 0.3405,
      "step": 715
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.123814582824707,
      "learning_rate": 2.105065791281939e-05,
      "loss": 0.345,
      "step": 720
    }
  ],
  "logging_steps": 5,
  "max_steps": 1800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.065684080953131e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|