| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.1999980000199998, |
| "eval_steps": 500, |
| "global_step": 20000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.000999990000099999, |
| "grad_norm": 5341.90380859375, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 594.5605, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.001999980000199998, |
| "grad_norm": 789.9061889648438, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 127.941, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.002999970000299997, |
| "grad_norm": 306.2316589355469, |
| "learning_rate": 6e-06, |
| "loss": 119.2364, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.003999960000399996, |
| "grad_norm": 751.376220703125, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 118.0761, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.004999950000499995, |
| "grad_norm": 335.5125732421875, |
| "learning_rate": 1e-05, |
| "loss": 109.0291, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.005999940000599994, |
| "grad_norm": 510.80975341796875, |
| "learning_rate": 1.2e-05, |
| "loss": 109.4396, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.006999930000699993, |
| "grad_norm": 314.0848083496094, |
| "learning_rate": 1.4e-05, |
| "loss": 84.1079, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.007999920000799993, |
| "grad_norm": 935.3131713867188, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 82.2216, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.008999910000899992, |
| "grad_norm": 1358.540771484375, |
| "learning_rate": 1.8e-05, |
| "loss": 80.6757, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.00999990000099999, |
| "grad_norm": 317.9821472167969, |
| "learning_rate": 2e-05, |
| "loss": 62.5531, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.010999890001099988, |
| "grad_norm": 649.0853271484375, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 56.7378, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.011999880001199987, |
| "grad_norm": 530.7359008789062, |
| "learning_rate": 2.4e-05, |
| "loss": 56.3481, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.012999870001299986, |
| "grad_norm": 241.44517517089844, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 53.1624, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.013999860001399985, |
| "grad_norm": 1705.9263916015625, |
| "learning_rate": 2.8e-05, |
| "loss": 57.0686, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.014999850001499985, |
| "grad_norm": 1379.0430908203125, |
| "learning_rate": 3.0000000000000004e-05, |
| "loss": 47.7146, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.015999840001599985, |
| "grad_norm": 253.24891662597656, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 42.4126, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.016999830001699984, |
| "grad_norm": 258.5624694824219, |
| "learning_rate": 3.4e-05, |
| "loss": 48.3719, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.017999820001799983, |
| "grad_norm": 273.0712585449219, |
| "learning_rate": 3.6e-05, |
| "loss": 38.7285, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.018999810001899983, |
| "grad_norm": 820.5573120117188, |
| "learning_rate": 3.8e-05, |
| "loss": 44.3776, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.01999980000199998, |
| "grad_norm": 538.52587890625, |
| "learning_rate": 4e-05, |
| "loss": 33.6354, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.02099979000209998, |
| "grad_norm": 987.8306884765625, |
| "learning_rate": 3.999997482501191e-05, |
| "loss": 39.2998, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.021999780002199976, |
| "grad_norm": 963.3218383789062, |
| "learning_rate": 3.9999899300111024e-05, |
| "loss": 30.5145, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.022999770002299975, |
| "grad_norm": 384.54278564453125, |
| "learning_rate": 3.999977342548747e-05, |
| "loss": 26.2319, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.023999760002399975, |
| "grad_norm": 362.4068908691406, |
| "learning_rate": 3.9999597201458134e-05, |
| "loss": 21.7486, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.024999750002499974, |
| "grad_norm": 246.06344604492188, |
| "learning_rate": 3.9999370628466666e-05, |
| "loss": 18.8825, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.025999740002599973, |
| "grad_norm": 558.910888671875, |
| "learning_rate": 3.9999093707083455e-05, |
| "loss": 18.5258, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.026999730002699972, |
| "grad_norm": 4613.8505859375, |
| "learning_rate": 3.999876643800567e-05, |
| "loss": 20.1839, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.02799972000279997, |
| "grad_norm": 246.72665405273438, |
| "learning_rate": 3.999838882205719e-05, |
| "loss": 17.0552, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.02899971000289997, |
| "grad_norm": 85.92310333251953, |
| "learning_rate": 3.9997960860188666e-05, |
| "loss": 16.4099, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.02999970000299997, |
| "grad_norm": 607.7698364257812, |
| "learning_rate": 3.9997482553477506e-05, |
| "loss": 15.0478, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.030999690003099968, |
| "grad_norm": 348.2640075683594, |
| "learning_rate": 3.999695390312783e-05, |
| "loss": 13.6557, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.03199968000319997, |
| "grad_norm": 553.7791137695312, |
| "learning_rate": 3.999637491047052e-05, |
| "loss": 16.3278, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.032999670003299966, |
| "grad_norm": 851.149169921875, |
| "learning_rate": 3.999574557696319e-05, |
| "loss": 15.7218, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.03399966000339997, |
| "grad_norm": 131.01693725585938, |
| "learning_rate": 3.9995065904190185e-05, |
| "loss": 11.3467, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.034999650003499964, |
| "grad_norm": 41.529632568359375, |
| "learning_rate": 3.999433589386259e-05, |
| "loss": 12.6374, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.03599964000359997, |
| "grad_norm": 205.26910400390625, |
| "learning_rate": 3.9993555547818186e-05, |
| "loss": 13.2414, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.03699963000369996, |
| "grad_norm": 89.92840576171875, |
| "learning_rate": 3.999272486802151e-05, |
| "loss": 10.1512, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.037999620003799965, |
| "grad_norm": 410.4055480957031, |
| "learning_rate": 3.9991843856563786e-05, |
| "loss": 9.8745, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.03899961000389996, |
| "grad_norm": 296.5409240722656, |
| "learning_rate": 3.999091251566297e-05, |
| "loss": 12.645, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.03999960000399996, |
| "grad_norm": 49.79493713378906, |
| "learning_rate": 3.9989930847663706e-05, |
| "loss": 12.447, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.04099959000409996, |
| "grad_norm": 477.4189147949219, |
| "learning_rate": 3.998889885503734e-05, |
| "loss": 11.1971, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.04199958000419996, |
| "grad_norm": 236.76950073242188, |
| "learning_rate": 3.998781654038192e-05, |
| "loss": 11.6718, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.04299957000429996, |
| "grad_norm": 484.04681396484375, |
| "learning_rate": 3.998668390642216e-05, |
| "loss": 9.9448, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.04399956000439995, |
| "grad_norm": 313.50457763671875, |
| "learning_rate": 3.998550095600948e-05, |
| "loss": 10.5262, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.044999550004499955, |
| "grad_norm": 82.21493530273438, |
| "learning_rate": 3.998426769212194e-05, |
| "loss": 13.1368, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.04599954000459995, |
| "grad_norm": 151.22616577148438, |
| "learning_rate": 3.9982984117864285e-05, |
| "loss": 9.8138, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.04699953000469995, |
| "grad_norm": 274.2380676269531, |
| "learning_rate": 3.9981650236467916e-05, |
| "loss": 10.2387, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.04799952000479995, |
| "grad_norm": 77.92356872558594, |
| "learning_rate": 3.998026605129088e-05, |
| "loss": 9.5225, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.04899951000489995, |
| "grad_norm": 202.85816955566406, |
| "learning_rate": 3.997883156581786e-05, |
| "loss": 9.8305, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.04999950000499995, |
| "grad_norm": 121.39033508300781, |
| "learning_rate": 3.9977346783660165e-05, |
| "loss": 12.1433, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.05099949000509995, |
| "grad_norm": 703.3681030273438, |
| "learning_rate": 3.997581170855573e-05, |
| "loss": 9.3006, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.051999480005199945, |
| "grad_norm": 905.860107421875, |
| "learning_rate": 3.9974226344369124e-05, |
| "loss": 9.2302, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.05299947000529995, |
| "grad_norm": 108.33662414550781, |
| "learning_rate": 3.9972590695091476e-05, |
| "loss": 9.0692, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.053999460005399944, |
| "grad_norm": 278.9888916015625, |
| "learning_rate": 3.9970904764840554e-05, |
| "loss": 9.0078, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.054999450005499946, |
| "grad_norm": 339.5447692871094, |
| "learning_rate": 3.9969168557860665e-05, |
| "loss": 8.5263, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.05599944000559994, |
| "grad_norm": 479.4326477050781, |
| "learning_rate": 3.9967382078522716e-05, |
| "loss": 8.6827, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.056999430005699944, |
| "grad_norm": 205.8404083251953, |
| "learning_rate": 3.9965545331324166e-05, |
| "loss": 8.0084, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.05799942000579994, |
| "grad_norm": 318.3268127441406, |
| "learning_rate": 3.996365832088903e-05, |
| "loss": 7.8264, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.05899941000589994, |
| "grad_norm": 545.3519287109375, |
| "learning_rate": 3.996172105196785e-05, |
| "loss": 8.5439, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.05999940000599994, |
| "grad_norm": 536.3674926757812, |
| "learning_rate": 3.995973352943769e-05, |
| "loss": 7.2916, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.06099939000609994, |
| "grad_norm": 29.15781593322754, |
| "learning_rate": 3.995769575830215e-05, |
| "loss": 8.3879, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.061999380006199936, |
| "grad_norm": 573.3207397460938, |
| "learning_rate": 3.99556077436913e-05, |
| "loss": 6.6733, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.06299937000629993, |
| "grad_norm": 206.36526489257812, |
| "learning_rate": 3.995346949086174e-05, |
| "loss": 5.7694, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.06399936000639994, |
| "grad_norm": 16834.9296875, |
| "learning_rate": 3.9951281005196486e-05, |
| "loss": 6.1864, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.06499935000649994, |
| "grad_norm": 401.1425476074219, |
| "learning_rate": 3.994904229220507e-05, |
| "loss": 6.1338, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.06599934000659993, |
| "grad_norm": 277.509765625, |
| "learning_rate": 3.994675335752345e-05, |
| "loss": 4.8342, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.06699933000669993, |
| "grad_norm": 544.186767578125, |
| "learning_rate": 3.9944414206914e-05, |
| "loss": 8.32, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.06799932000679994, |
| "grad_norm": 174.74789428710938, |
| "learning_rate": 3.994202484626555e-05, |
| "loss": 10.03, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.06899931000689993, |
| "grad_norm": 562.16357421875, |
| "learning_rate": 3.99395852815933e-05, |
| "loss": 7.7643, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.06999930000699993, |
| "grad_norm": 219.15576171875, |
| "learning_rate": 3.993709551903885e-05, |
| "loss": 8.8717, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.07099929000709992, |
| "grad_norm": 159.7786102294922, |
| "learning_rate": 3.993455556487018e-05, |
| "loss": 7.3802, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.07199928000719993, |
| "grad_norm": 224.97128295898438, |
| "learning_rate": 3.993196542548162e-05, |
| "loss": 9.665, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.07299927000729993, |
| "grad_norm": 691.8470458984375, |
| "learning_rate": 3.992932510739383e-05, |
| "loss": 5.8392, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.07399926000739993, |
| "grad_norm": 196.7902069091797, |
| "learning_rate": 3.992663461725383e-05, |
| "loss": 5.8781, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.07499925000749992, |
| "grad_norm": 450.8050231933594, |
| "learning_rate": 3.9923893961834914e-05, |
| "loss": 8.0027, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.07599924000759993, |
| "grad_norm": 182.17953491210938, |
| "learning_rate": 3.992110314803668e-05, |
| "loss": 4.6279, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.07699923000769993, |
| "grad_norm": 176.743896484375, |
| "learning_rate": 3.9918262182884994e-05, |
| "loss": 8.5188, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.07799922000779992, |
| "grad_norm": 382.1143798828125, |
| "learning_rate": 3.9915371073531995e-05, |
| "loss": 6.4761, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.07899921000789992, |
| "grad_norm": 513.8743286132812, |
| "learning_rate": 3.991242982725603e-05, |
| "loss": 5.1712, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.07999920000799993, |
| "grad_norm": 197.9111785888672, |
| "learning_rate": 3.9909438451461695e-05, |
| "loss": 7.5148, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.08099919000809992, |
| "grad_norm": 165.2156982421875, |
| "learning_rate": 3.990639695367977e-05, |
| "loss": 5.1054, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.08199918000819992, |
| "grad_norm": 162.3336944580078, |
| "learning_rate": 3.990330534156723e-05, |
| "loss": 5.3642, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.08299917000829991, |
| "grad_norm": 149.72377014160156, |
| "learning_rate": 3.9900163622907196e-05, |
| "loss": 7.4674, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.08399916000839992, |
| "grad_norm": 110.8755874633789, |
| "learning_rate": 3.9896971805608945e-05, |
| "loss": 6.2615, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.08499915000849992, |
| "grad_norm": 97.61107635498047, |
| "learning_rate": 3.989372989770787e-05, |
| "loss": 6.0901, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.08599914000859991, |
| "grad_norm": 47.84280776977539, |
| "learning_rate": 3.989043790736547e-05, |
| "loss": 6.6694, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.08699913000869991, |
| "grad_norm": 418.85052490234375, |
| "learning_rate": 3.988709584286933e-05, |
| "loss": 6.1093, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.0879991200087999, |
| "grad_norm": 402.12933349609375, |
| "learning_rate": 3.98837037126331e-05, |
| "loss": 5.6737, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.08899911000889991, |
| "grad_norm": 136.71066284179688, |
| "learning_rate": 3.988026152519645e-05, |
| "loss": 6.0548, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.08999910000899991, |
| "grad_norm": 319.9411926269531, |
| "learning_rate": 3.9876769289225084e-05, |
| "loss": 6.1571, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.0909990900090999, |
| "grad_norm": 216.7753143310547, |
| "learning_rate": 3.9873227013510714e-05, |
| "loss": 4.6247, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.0919990800091999, |
| "grad_norm": 554.5048828125, |
| "learning_rate": 3.9869634706971e-05, |
| "loss": 6.5048, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.09299907000929991, |
| "grad_norm": 131.7164764404297, |
| "learning_rate": 3.986599237864959e-05, |
| "loss": 6.2413, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.0939990600093999, |
| "grad_norm": 102.79979705810547, |
| "learning_rate": 3.9862300037716025e-05, |
| "loss": 5.7489, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.0949990500094999, |
| "grad_norm": 37.355709075927734, |
| "learning_rate": 3.9858557693465766e-05, |
| "loss": 5.7535, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.0959990400095999, |
| "grad_norm": 137.89903259277344, |
| "learning_rate": 3.985476535532018e-05, |
| "loss": 4.4851, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.09699903000969991, |
| "grad_norm": 174.57391357421875, |
| "learning_rate": 3.985092303282645e-05, |
| "loss": 9.4012, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.0979990200097999, |
| "grad_norm": 99.09408569335938, |
| "learning_rate": 3.9847030735657624e-05, |
| "loss": 7.2147, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.0989990100098999, |
| "grad_norm": 265.9083557128906, |
| "learning_rate": 3.984308847361257e-05, |
| "loss": 7.8453, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.0999990000099999, |
| "grad_norm": 240.05801391601562, |
| "learning_rate": 3.983909625661591e-05, |
| "loss": 6.4133, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.1009989900100999, |
| "grad_norm": 245.1486053466797, |
| "learning_rate": 3.983505409471806e-05, |
| "loss": 5.5553, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.1019989800101999, |
| "grad_norm": 654.8526000976562, |
| "learning_rate": 3.9830961998095146e-05, |
| "loss": 4.5454, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.1029989700102999, |
| "grad_norm": 49.497371673583984, |
| "learning_rate": 3.982681997704902e-05, |
| "loss": 4.0421, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.10399896001039989, |
| "grad_norm": 119.20033264160156, |
| "learning_rate": 3.982262804200723e-05, |
| "loss": 4.6029, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.1049989500104999, |
| "grad_norm": 595.1155395507812, |
| "learning_rate": 3.981838620352294e-05, |
| "loss": 3.8456, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.1059989400105999, |
| "grad_norm": 29.171674728393555, |
| "learning_rate": 3.9814094472275e-05, |
| "loss": 4.0763, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.10699893001069989, |
| "grad_norm": 359.91021728515625, |
| "learning_rate": 3.9809752859067823e-05, |
| "loss": 3.6656, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.10799892001079989, |
| "grad_norm": 137.34120178222656, |
| "learning_rate": 3.980536137483141e-05, |
| "loss": 5.2409, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.1089989100108999, |
| "grad_norm": 359.36151123046875, |
| "learning_rate": 3.9800920030621334e-05, |
| "loss": 7.5297, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.10999890001099989, |
| "grad_norm": 609.0127563476562, |
| "learning_rate": 3.979642883761866e-05, |
| "loss": 5.2982, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.11099889001109989, |
| "grad_norm": 1026.407470703125, |
| "learning_rate": 3.979188780712996e-05, |
| "loss": 5.7533, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.11199888001119988, |
| "grad_norm": 1161.3192138671875, |
| "learning_rate": 3.978729695058729e-05, |
| "loss": 3.8523, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.11299887001129989, |
| "grad_norm": 207.47286987304688, |
| "learning_rate": 3.9782656279548114e-05, |
| "loss": 5.7909, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.11399886001139989, |
| "grad_norm": 180.18321228027344, |
| "learning_rate": 3.9777965805695315e-05, |
| "loss": 4.8786, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.11499885001149988, |
| "grad_norm": 154.166259765625, |
| "learning_rate": 3.977322554083716e-05, |
| "loss": 5.9407, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.11599884001159988, |
| "grad_norm": 175.445556640625, |
| "learning_rate": 3.976843549690725e-05, |
| "loss": 3.8969, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.11699883001169989, |
| "grad_norm": 204.4251251220703, |
| "learning_rate": 3.976359568596453e-05, |
| "loss": 4.1672, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.11799882001179988, |
| "grad_norm": 330.901123046875, |
| "learning_rate": 3.97587061201932e-05, |
| "loss": 4.1925, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.11899881001189988, |
| "grad_norm": 563.4291381835938, |
| "learning_rate": 3.9753766811902756e-05, |
| "loss": 3.4035, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.11999880001199988, |
| "grad_norm": 57.178955078125, |
| "learning_rate": 3.974877777352789e-05, |
| "loss": 3.4773, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.12099879001209989, |
| "grad_norm": 94.94606018066406, |
| "learning_rate": 3.97437390176285e-05, |
| "loss": 3.5796, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.12199878001219988, |
| "grad_norm": 801.7224731445312, |
| "learning_rate": 3.973865055688965e-05, |
| "loss": 3.4456, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.12299877001229988, |
| "grad_norm": 211.559814453125, |
| "learning_rate": 3.973351240412153e-05, |
| "loss": 5.9604, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.12399876001239987, |
| "grad_norm": 47.33822250366211, |
| "learning_rate": 3.972832457225944e-05, |
| "loss": 1.6479, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.12499875001249988, |
| "grad_norm": 517.2850341796875, |
| "learning_rate": 3.972308707436374e-05, |
| "loss": 4.1837, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.12599874001259986, |
| "grad_norm": 168.18865966796875, |
| "learning_rate": 3.971779992361981e-05, |
| "loss": 3.7355, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.12699873001269987, |
| "grad_norm": 582.1716918945312, |
| "learning_rate": 3.971246313333807e-05, |
| "loss": 4.9486, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.12799872001279988, |
| "grad_norm": 337.09344482421875, |
| "learning_rate": 3.9707076716953866e-05, |
| "loss": 4.6692, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.12899871001289986, |
| "grad_norm": 383.8667297363281, |
| "learning_rate": 3.97016406880275e-05, |
| "loss": 3.3209, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.12999870001299987, |
| "grad_norm": 109.9424819946289, |
| "learning_rate": 3.9696155060244166e-05, |
| "loss": 3.5282, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.13099869001309986, |
| "grad_norm": 74.46175384521484, |
| "learning_rate": 3.969061984741393e-05, |
| "loss": 3.883, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.13199868001319986, |
| "grad_norm": 170.72601318359375, |
| "learning_rate": 3.9685035063471675e-05, |
| "loss": 6.4028, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.13299867001329987, |
| "grad_norm": 588.787109375, |
| "learning_rate": 3.9679400722477096e-05, |
| "loss": 3.1633, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.13399866001339986, |
| "grad_norm": 172.7151336669922, |
| "learning_rate": 3.967371683861465e-05, |
| "loss": 4.3976, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.13499865001349987, |
| "grad_norm": 728.732666015625, |
| "learning_rate": 3.9667983426193485e-05, |
| "loss": 3.4596, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.13599864001359988, |
| "grad_norm": 530.2247314453125, |
| "learning_rate": 3.9662200499647464e-05, |
| "loss": 7.6628, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.13699863001369986, |
| "grad_norm": 1040.433837890625, |
| "learning_rate": 3.965636807353511e-05, |
| "loss": 5.9463, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.13799862001379987, |
| "grad_norm": 94.73613739013672, |
| "learning_rate": 3.9650486162539555e-05, |
| "loss": 3.4241, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.13899861001389985, |
| "grad_norm": 87.12159729003906, |
| "learning_rate": 3.964455478146848e-05, |
| "loss": 4.2988, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.13999860001399986, |
| "grad_norm": 422.29547119140625, |
| "learning_rate": 3.963857394525413e-05, |
| "loss": 2.7699, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.14099859001409987, |
| "grad_norm": 677.4222412109375, |
| "learning_rate": 3.9632543668953284e-05, |
| "loss": 3.9013, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.14199858001419985, |
| "grad_norm": 98.60820770263672, |
| "learning_rate": 3.9626463967747126e-05, |
| "loss": 2.8094, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.14299857001429986, |
| "grad_norm": 567.5491333007812, |
| "learning_rate": 3.9620334856941305e-05, |
| "loss": 1.7368, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.14399856001439987, |
| "grad_norm": 19.20868492126465, |
| "learning_rate": 3.961415635196585e-05, |
| "loss": 2.8347, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.14499855001449985, |
| "grad_norm": 131.01400756835938, |
| "learning_rate": 3.960792846837514e-05, |
| "loss": 3.9196, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.14599854001459986, |
| "grad_norm": 113.39109802246094, |
| "learning_rate": 3.960165122184787e-05, |
| "loss": 2.9107, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.14699853001469984, |
| "grad_norm": 701.5942993164062, |
| "learning_rate": 3.959532462818699e-05, |
| "loss": 1.98, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.14799852001479985, |
| "grad_norm": 83.20955657958984, |
| "learning_rate": 3.958894870331971e-05, |
| "loss": 2.093, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.14899851001489986, |
| "grad_norm": 473.39202880859375, |
| "learning_rate": 3.958252346329739e-05, |
| "loss": 3.0402, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.14999850001499984, |
| "grad_norm": 267.7691955566406, |
| "learning_rate": 3.957604892429558e-05, |
| "loss": 5.534, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.15099849001509985, |
| "grad_norm": 278.0105285644531, |
| "learning_rate": 3.956952510261392e-05, |
| "loss": 3.1192, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.15199848001519986, |
| "grad_norm": 76.60382080078125, |
| "learning_rate": 3.9562952014676116e-05, |
| "loss": 4.55, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.15299847001529984, |
| "grad_norm": 62.512725830078125, |
| "learning_rate": 3.955632967702992e-05, |
| "loss": 4.7252, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.15399846001539985, |
| "grad_norm": 52.81327819824219, |
| "learning_rate": 3.954965810634706e-05, |
| "loss": 4.3578, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.15499845001549983, |
| "grad_norm": 388.27850341796875, |
| "learning_rate": 3.954293731942319e-05, |
| "loss": 3.9787, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.15599844001559984, |
| "grad_norm": 83.5515365600586, |
| "learning_rate": 3.953616733317791e-05, |
| "loss": 0.5327, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.15699843001569985, |
| "grad_norm": 104.8029556274414, |
| "learning_rate": 3.9529348164654625e-05, |
| "loss": 5.1422, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.15799842001579983, |
| "grad_norm": 682.9906616210938, |
| "learning_rate": 3.9522479831020605e-05, |
| "loss": 0.6372, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.15899841001589984, |
| "grad_norm": 970.7451171875, |
| "learning_rate": 3.951556234956686e-05, |
| "loss": 2.5437, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.15999840001599985, |
| "grad_norm": 276.81207275390625, |
| "learning_rate": 3.950859573770815e-05, |
| "loss": 0.5738, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.16099839001609983, |
| "grad_norm": 121.99581909179688, |
| "learning_rate": 3.9501580012982894e-05, |
| "loss": 2.5115, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.16199838001619984, |
| "grad_norm": 234.561279296875, |
| "learning_rate": 3.949451519305319e-05, |
| "loss": 2.9896, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.16299837001629983, |
| "grad_norm": 576.3875732421875, |
| "learning_rate": 3.948740129570471e-05, |
| "loss": 2.834, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.16399836001639984, |
| "grad_norm": 256.8683166503906, |
| "learning_rate": 3.948023833884667e-05, |
| "loss": 2.7145, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.16499835001649985, |
| "grad_norm": 782.2625122070312, |
| "learning_rate": 3.947302634051182e-05, |
| "loss": 2.0984, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.16599834001659983, |
| "grad_norm": 237.81005859375, |
| "learning_rate": 3.946576531885636e-05, |
| "loss": 1.2758, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.16699833001669984, |
| "grad_norm": 148.71981811523438, |
| "learning_rate": 3.9458455292159883e-05, |
| "loss": 5.3672, |
| "step": 16700 |
| }, |
| { |
| "epoch": 0.16799832001679985, |
| "grad_norm": 339.80877685546875, |
| "learning_rate": 3.9451096278825386e-05, |
| "loss": 1.9513, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.16899831001689983, |
| "grad_norm": 339.36767578125, |
| "learning_rate": 3.944368829737918e-05, |
| "loss": 3.6129, |
| "step": 16900 |
| }, |
| { |
| "epoch": 0.16999830001699984, |
| "grad_norm": 45.14118194580078, |
| "learning_rate": 3.9436231366470836e-05, |
| "loss": 4.6565, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.17099829001709982, |
| "grad_norm": 47.00923156738281, |
| "learning_rate": 3.942872550487318e-05, |
| "loss": 3.9665, |
| "step": 17100 |
| }, |
| { |
| "epoch": 0.17199828001719983, |
| "grad_norm": 100.63198852539062, |
| "learning_rate": 3.942117073148221e-05, |
| "loss": 3.8538, |
| "step": 17200 |
| }, |
| { |
| "epoch": 0.17299827001729984, |
| "grad_norm": 13.572708129882812, |
| "learning_rate": 3.9413567065317056e-05, |
| "loss": 2.2025, |
| "step": 17300 |
| }, |
| { |
| "epoch": 0.17399826001739982, |
| "grad_norm": 411.76959228515625, |
| "learning_rate": 3.940591452551993e-05, |
| "loss": 1.3909, |
| "step": 17400 |
| }, |
| { |
| "epoch": 0.17499825001749983, |
| "grad_norm": 48.388797760009766, |
| "learning_rate": 3.93982131313561e-05, |
| "loss": 2.3617, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.1759982400175998, |
| "grad_norm": 107.87911224365234, |
| "learning_rate": 3.939046290221383e-05, |
| "loss": 2.1434, |
| "step": 17600 |
| }, |
| { |
| "epoch": 0.17699823001769982, |
| "grad_norm": 36.4320182800293, |
| "learning_rate": 3.938266385760429e-05, |
| "loss": 1.2506, |
| "step": 17700 |
| }, |
| { |
| "epoch": 0.17799822001779983, |
| "grad_norm": 293.2892150878906, |
| "learning_rate": 3.937481601716157e-05, |
| "loss": 3.2974, |
| "step": 17800 |
| }, |
| { |
| "epoch": 0.1789982100178998, |
| "grad_norm": 595.9169921875, |
| "learning_rate": 3.936691940064261e-05, |
| "loss": 3.2548, |
| "step": 17900 |
| }, |
| { |
| "epoch": 0.17999820001799982, |
| "grad_norm": 402.92236328125, |
| "learning_rate": 3.935897402792713e-05, |
| "loss": 1.4305, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.18099819001809983, |
| "grad_norm": 56.42055892944336, |
| "learning_rate": 3.935097991901759e-05, |
| "loss": 1.3576, |
| "step": 18100 |
| }, |
| { |
| "epoch": 0.1819981800181998, |
| "grad_norm": 38.25709533691406, |
| "learning_rate": 3.934293709403915e-05, |
| "loss": 2.9067, |
| "step": 18200 |
| }, |
| { |
| "epoch": 0.18299817001829982, |
| "grad_norm": 263.8206787109375, |
| "learning_rate": 3.933484557323961e-05, |
| "loss": 3.5546, |
| "step": 18300 |
| }, |
| { |
| "epoch": 0.1839981600183998, |
| "grad_norm": 190.02703857421875, |
| "learning_rate": 3.932670537698937e-05, |
| "loss": 3.0082, |
| "step": 18400 |
| }, |
| { |
| "epoch": 0.1849981500184998, |
| "grad_norm": 171.09291076660156, |
| "learning_rate": 3.931851652578137e-05, |
| "loss": 2.6617, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.18599814001859982, |
| "grad_norm": 202.73995971679688, |
| "learning_rate": 3.931027904023102e-05, |
| "loss": 2.818, |
| "step": 18600 |
| }, |
| { |
| "epoch": 0.1869981300186998, |
| "grad_norm": 179.5336456298828, |
| "learning_rate": 3.9301992941076185e-05, |
| "loss": 3.1686, |
| "step": 18700 |
| }, |
| { |
| "epoch": 0.1879981200187998, |
| "grad_norm": 396.6241455078125, |
| "learning_rate": 3.929365824917712e-05, |
| "loss": 1.8101, |
| "step": 18800 |
| }, |
| { |
| "epoch": 0.18899811001889982, |
| "grad_norm": 82.62757873535156, |
| "learning_rate": 3.928527498551639e-05, |
| "loss": 1.5857, |
| "step": 18900 |
| }, |
| { |
| "epoch": 0.1899981000189998, |
| "grad_norm": 733.2717895507812, |
| "learning_rate": 3.9276843171198844e-05, |
| "loss": 2.6652, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.19099809001909981, |
| "grad_norm": 960.3976440429688, |
| "learning_rate": 3.926836282745158e-05, |
| "loss": 3.8051, |
| "step": 19100 |
| }, |
| { |
| "epoch": 0.1919980800191998, |
| "grad_norm": 534.16748046875, |
| "learning_rate": 3.925983397562385e-05, |
| "loss": 2.8182, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.1929980700192998, |
| "grad_norm": 233.56008911132812, |
| "learning_rate": 3.925125663718703e-05, |
| "loss": 2.5726, |
| "step": 19300 |
| }, |
| { |
| "epoch": 0.19399806001939982, |
| "grad_norm": 450.2645263671875, |
| "learning_rate": 3.924263083373455e-05, |
| "loss": 1.7716, |
| "step": 19400 |
| }, |
| { |
| "epoch": 0.1949980500194998, |
| "grad_norm": 81.5189208984375, |
| "learning_rate": 3.923395658698186e-05, |
| "loss": 4.5006, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.1959980400195998, |
| "grad_norm": 84.00701141357422, |
| "learning_rate": 3.922523391876638e-05, |
| "loss": 0.5761, |
| "step": 19600 |
| }, |
| { |
| "epoch": 0.19699803001969982, |
| "grad_norm": 229.67648315429688, |
| "learning_rate": 3.9216462851047405e-05, |
| "loss": 5.6245, |
| "step": 19700 |
| }, |
| { |
| "epoch": 0.1979980200197998, |
| "grad_norm": 603.322265625, |
| "learning_rate": 3.9207643405906094e-05, |
| "loss": 3.7712, |
| "step": 19800 |
| }, |
| { |
| "epoch": 0.1989980100198998, |
| "grad_norm": 282.677734375, |
| "learning_rate": 3.9198775605545385e-05, |
| "loss": -0.2731, |
| "step": 19900 |
| }, |
| { |
| "epoch": 0.1999980000199998, |
| "grad_norm": 87.2725601196289, |
| "learning_rate": 3.9189859472289956e-05, |
| "loss": 2.8561, |
| "step": 20000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 200000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 5, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|