| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.423800044238001, |
| "eval_steps": 500, |
| "global_step": 20000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.022119000221190004, |
| "grad_norm": 14340.359375, |
| "learning_rate": 2.5e-06, |
| "loss": 6523.6544, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04423800044238001, |
| "grad_norm": 1850.3282470703125, |
| "learning_rate": 5e-06, |
| "loss": 350.7309, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.06635700066357, |
| "grad_norm": 1415.2620849609375, |
| "learning_rate": 7.5e-06, |
| "loss": 321.6054, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.08847600088476001, |
| "grad_norm": 3297.33984375, |
| "learning_rate": 1e-05, |
| "loss": 260.2301, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.11059500110595001, |
| "grad_norm": 2892.188232421875, |
| "learning_rate": 1.25e-05, |
| "loss": 308.3714, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.13271400132714, |
| "grad_norm": 1215.732666015625, |
| "learning_rate": 1.5e-05, |
| "loss": 343.5222, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.15483300154833002, |
| "grad_norm": 1504.814453125, |
| "learning_rate": 1.75e-05, |
| "loss": 301.771, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.17695200176952003, |
| "grad_norm": 714.8418579101562, |
| "learning_rate": 2e-05, |
| "loss": 290.3531, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.19907100199071, |
| "grad_norm": 2755.97998046875, |
| "learning_rate": 2.25e-05, |
| "loss": 179.3829, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.22119000221190002, |
| "grad_norm": 353.36248779296875, |
| "learning_rate": 2.5e-05, |
| "loss": 310.0189, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.24330900243309003, |
| "grad_norm": 418.4841613769531, |
| "learning_rate": 2.7500000000000004e-05, |
| "loss": 237.1839, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.26542800265428, |
| "grad_norm": 259.7605285644531, |
| "learning_rate": 3e-05, |
| "loss": 221.7796, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.28754700287547, |
| "grad_norm": 997.5048217773438, |
| "learning_rate": 3.2500000000000004e-05, |
| "loss": 345.3388, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.30966600309666004, |
| "grad_norm": 36418.90625, |
| "learning_rate": 3.5e-05, |
| "loss": 342.6086, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.33178500331785005, |
| "grad_norm": 863.9041748046875, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 233.7479, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.35390400353904006, |
| "grad_norm": 622.4481811523438, |
| "learning_rate": 4e-05, |
| "loss": 134.1319, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.37602300376023, |
| "grad_norm": 699.130859375, |
| "learning_rate": 4.25e-05, |
| "loss": 192.9184, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.39814200398142, |
| "grad_norm": 893.842041015625, |
| "learning_rate": 4.5e-05, |
| "loss": 309.9599, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.42026100420261003, |
| "grad_norm": 478.645751953125, |
| "learning_rate": 4.75e-05, |
| "loss": 160.4654, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.44238000442380004, |
| "grad_norm": 833.3300170898438, |
| "learning_rate": 5e-05, |
| "loss": 184.012, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.46449900464499005, |
| "grad_norm": 7655.88037109375, |
| "learning_rate": 4.972222222222223e-05, |
| "loss": 257.931, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.48661800486618007, |
| "grad_norm": 466.54302978515625, |
| "learning_rate": 4.9444444444444446e-05, |
| "loss": 154.0851, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.5087370050873701, |
| "grad_norm": 948.7927856445312, |
| "learning_rate": 4.9166666666666665e-05, |
| "loss": 170.1584, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.53085600530856, |
| "grad_norm": 767.844482421875, |
| "learning_rate": 4.888888888888889e-05, |
| "loss": 179.1037, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.5529750055297501, |
| "grad_norm": 1689.810791015625, |
| "learning_rate": 4.8611111111111115e-05, |
| "loss": 197.1341, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.57509400575094, |
| "grad_norm": 1661.3369140625, |
| "learning_rate": 4.8333333333333334e-05, |
| "loss": 411.5847, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.59721300597213, |
| "grad_norm": 833.5604858398438, |
| "learning_rate": 4.805555555555556e-05, |
| "loss": 119.3549, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.6193320061933201, |
| "grad_norm": 562038.25, |
| "learning_rate": 4.7777777777777784e-05, |
| "loss": 245.144, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.64145100641451, |
| "grad_norm": 1275.5999755859375, |
| "learning_rate": 4.75e-05, |
| "loss": 197.1938, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.6635700066357001, |
| "grad_norm": 464.8160095214844, |
| "learning_rate": 4.722222222222222e-05, |
| "loss": 206.2149, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.68568900685689, |
| "grad_norm": 296.2396240234375, |
| "learning_rate": 4.6944444444444446e-05, |
| "loss": 146.9992, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.7078080070780801, |
| "grad_norm": 490.5688171386719, |
| "learning_rate": 4.666666666666667e-05, |
| "loss": 121.8763, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.7299270072992701, |
| "grad_norm": 320.01116943359375, |
| "learning_rate": 4.638888888888889e-05, |
| "loss": 82.6486, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.75204600752046, |
| "grad_norm": 269.9767150878906, |
| "learning_rate": 4.6111111111111115e-05, |
| "loss": 144.595, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.7741650077416501, |
| "grad_norm": 662.4879150390625, |
| "learning_rate": 4.5833333333333334e-05, |
| "loss": 140.9867, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.79628400796284, |
| "grad_norm": 882.5130615234375, |
| "learning_rate": 4.555555555555556e-05, |
| "loss": 130.253, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.8184030081840301, |
| "grad_norm": 631.7805786132812, |
| "learning_rate": 4.527777777777778e-05, |
| "loss": 95.0868, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.8405220084052201, |
| "grad_norm": 197.107421875, |
| "learning_rate": 4.5e-05, |
| "loss": 106.4198, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.8626410086264101, |
| "grad_norm": 3084.970947265625, |
| "learning_rate": 4.472222222222223e-05, |
| "loss": 145.6145, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.8847600088476001, |
| "grad_norm": 703.7938232421875, |
| "learning_rate": 4.4444444444444447e-05, |
| "loss": 133.7272, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.90687900906879, |
| "grad_norm": 546.3758544921875, |
| "learning_rate": 4.4166666666666665e-05, |
| "loss": 190.9529, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.9289980092899801, |
| "grad_norm": 415.634765625, |
| "learning_rate": 4.388888888888889e-05, |
| "loss": 149.3702, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.9511170095111701, |
| "grad_norm": 614.1394653320312, |
| "learning_rate": 4.3611111111111116e-05, |
| "loss": 148.1649, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.9732360097323601, |
| "grad_norm": 957.6664428710938, |
| "learning_rate": 4.3333333333333334e-05, |
| "loss": 134.7802, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.9953550099535501, |
| "grad_norm": 700.3091430664062, |
| "learning_rate": 4.305555555555556e-05, |
| "loss": 128.7425, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 229.09039306640625, |
| "eval_runtime": 13.3799, |
| "eval_samples_per_second": 150.226, |
| "eval_steps_per_second": 37.594, |
| "step": 4521 |
| }, |
| { |
| "epoch": 1.0174740101747402, |
| "grad_norm": 159.52822875976562, |
| "learning_rate": 4.277777777777778e-05, |
| "loss": 160.3704, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.03959301039593, |
| "grad_norm": 1282.0911865234375, |
| "learning_rate": 4.25e-05, |
| "loss": 135.6942, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.06171201061712, |
| "grad_norm": 542.7197265625, |
| "learning_rate": 4.222222222222222e-05, |
| "loss": 174.3882, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.08383101083831, |
| "grad_norm": 1587.159423828125, |
| "learning_rate": 4.194444444444445e-05, |
| "loss": 153.1831, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.1059500110595002, |
| "grad_norm": 695.9743041992188, |
| "learning_rate": 4.166666666666667e-05, |
| "loss": 109.4728, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.1280690112806901, |
| "grad_norm": 515.8330078125, |
| "learning_rate": 4.138888888888889e-05, |
| "loss": 136.7678, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.15018801150188, |
| "grad_norm": 586.1018676757812, |
| "learning_rate": 4.111111111111111e-05, |
| "loss": 84.0752, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.17230701172307, |
| "grad_norm": 995.541259765625, |
| "learning_rate": 4.0833333333333334e-05, |
| "loss": 155.4204, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.1944260119442602, |
| "grad_norm": 742.328125, |
| "learning_rate": 4.055555555555556e-05, |
| "loss": 72.3888, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.2165450121654502, |
| "grad_norm": 684.4664916992188, |
| "learning_rate": 4.027777777777778e-05, |
| "loss": 144.5675, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.2386640123866401, |
| "grad_norm": 810.091552734375, |
| "learning_rate": 4e-05, |
| "loss": 105.8335, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.26078301260783, |
| "grad_norm": 1068.27783203125, |
| "learning_rate": 3.972222222222222e-05, |
| "loss": 108.972, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.28290201282902, |
| "grad_norm": 902.8329467773438, |
| "learning_rate": 3.944444444444445e-05, |
| "loss": 114.8165, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.3050210130502102, |
| "grad_norm": 1491.9244384765625, |
| "learning_rate": 3.9166666666666665e-05, |
| "loss": 88.0044, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.3271400132714002, |
| "grad_norm": 441.6331481933594, |
| "learning_rate": 3.888888888888889e-05, |
| "loss": 105.6323, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.3492590134925901, |
| "grad_norm": 121.39036560058594, |
| "learning_rate": 3.8611111111111116e-05, |
| "loss": 140.8393, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.37137801371378, |
| "grad_norm": 839.1984252929688, |
| "learning_rate": 3.8333333333333334e-05, |
| "loss": 97.4277, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.39349701393497, |
| "grad_norm": 612.2739868164062, |
| "learning_rate": 3.805555555555555e-05, |
| "loss": 80.9864, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.4156160141561602, |
| "grad_norm": 588.4722290039062, |
| "learning_rate": 3.777777777777778e-05, |
| "loss": 104.521, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.4377350143773502, |
| "grad_norm": 325.037353515625, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 131.7437, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.4598540145985401, |
| "grad_norm": 196.6481170654297, |
| "learning_rate": 3.722222222222222e-05, |
| "loss": 117.8643, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.48197301481973, |
| "grad_norm": 424.9925231933594, |
| "learning_rate": 3.694444444444445e-05, |
| "loss": 130.4872, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.50409201504092, |
| "grad_norm": 796.464599609375, |
| "learning_rate": 3.6666666666666666e-05, |
| "loss": 115.8529, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.5262110152621102, |
| "grad_norm": 2076.1787109375, |
| "learning_rate": 3.638888888888889e-05, |
| "loss": 185.1337, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.5483300154833002, |
| "grad_norm": 383.5282287597656, |
| "learning_rate": 3.611111111111111e-05, |
| "loss": 78.6264, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.5704490157044901, |
| "grad_norm": 591.9740600585938, |
| "learning_rate": 3.5833333333333335e-05, |
| "loss": 86.4212, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.5925680159256803, |
| "grad_norm": 815.822998046875, |
| "learning_rate": 3.555555555555556e-05, |
| "loss": 105.2421, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.61468701614687, |
| "grad_norm": 698.5433959960938, |
| "learning_rate": 3.527777777777778e-05, |
| "loss": 107.0973, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.6368060163680602, |
| "grad_norm": 260.39154052734375, |
| "learning_rate": 3.5e-05, |
| "loss": 102.5785, |
| "step": 7400 |
| }, |
| { |
| "epoch": 1.6589250165892502, |
| "grad_norm": 8398.65234375, |
| "learning_rate": 3.472222222222222e-05, |
| "loss": 145.6558, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.6810440168104401, |
| "grad_norm": 929.0120239257812, |
| "learning_rate": 3.444444444444445e-05, |
| "loss": 93.9839, |
| "step": 7600 |
| }, |
| { |
| "epoch": 1.7031630170316303, |
| "grad_norm": 627.1079711914062, |
| "learning_rate": 3.4166666666666666e-05, |
| "loss": 125.3684, |
| "step": 7700 |
| }, |
| { |
| "epoch": 1.72528201725282, |
| "grad_norm": 408.4013671875, |
| "learning_rate": 3.388888888888889e-05, |
| "loss": 91.5595, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.7474010174740102, |
| "grad_norm": 304.0116882324219, |
| "learning_rate": 3.3611111111111116e-05, |
| "loss": 79.996, |
| "step": 7900 |
| }, |
| { |
| "epoch": 1.7695200176952002, |
| "grad_norm": 985.1416625976562, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 106.5055, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.7916390179163901, |
| "grad_norm": 381.7845764160156, |
| "learning_rate": 3.3055555555555553e-05, |
| "loss": 184.2213, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.8137580181375803, |
| "grad_norm": 739.7052612304688, |
| "learning_rate": 3.277777777777778e-05, |
| "loss": 81.3863, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.83587701835877, |
| "grad_norm": 315.7475280761719, |
| "learning_rate": 3.2500000000000004e-05, |
| "loss": 92.0976, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.8579960185799602, |
| "grad_norm": 326.6717529296875, |
| "learning_rate": 3.222222222222223e-05, |
| "loss": 112.162, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.8801150188011502, |
| "grad_norm": 322.20086669921875, |
| "learning_rate": 3.194444444444444e-05, |
| "loss": 111.4809, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.9022340190223401, |
| "grad_norm": 1028.286376953125, |
| "learning_rate": 3.1666666666666666e-05, |
| "loss": 56.9394, |
| "step": 8600 |
| }, |
| { |
| "epoch": 1.9243530192435303, |
| "grad_norm": 696.5010986328125, |
| "learning_rate": 3.138888888888889e-05, |
| "loss": 100.1622, |
| "step": 8700 |
| }, |
| { |
| "epoch": 1.94647201946472, |
| "grad_norm": 227.0893096923828, |
| "learning_rate": 3.111111111111111e-05, |
| "loss": 102.4857, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.9685910196859102, |
| "grad_norm": 571.6776123046875, |
| "learning_rate": 3.0833333333333335e-05, |
| "loss": 97.7735, |
| "step": 8900 |
| }, |
| { |
| "epoch": 1.9907100199071002, |
| "grad_norm": 2159.34716796875, |
| "learning_rate": 3.055555555555556e-05, |
| "loss": 138.7627, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 284.751708984375, |
| "eval_runtime": 13.2962, |
| "eval_samples_per_second": 151.171, |
| "eval_steps_per_second": 37.83, |
| "step": 9042 |
| }, |
| { |
| "epoch": 2.01282902012829, |
| "grad_norm": 425.97235107421875, |
| "learning_rate": 3.0277777777777776e-05, |
| "loss": 110.9788, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.0349480203494803, |
| "grad_norm": 428.1395568847656, |
| "learning_rate": 3e-05, |
| "loss": 88.1351, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.05706702057067, |
| "grad_norm": 409.4471130371094, |
| "learning_rate": 2.9722222222222223e-05, |
| "loss": 99.0849, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.07918602079186, |
| "grad_norm": 296.4221496582031, |
| "learning_rate": 2.9444444444444448e-05, |
| "loss": 104.3382, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.1013050210130504, |
| "grad_norm": 740.9823608398438, |
| "learning_rate": 2.916666666666667e-05, |
| "loss": 72.3218, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.12342402123424, |
| "grad_norm": 547.3069458007812, |
| "learning_rate": 2.8888888888888888e-05, |
| "loss": 56.0866, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.1455430214554303, |
| "grad_norm": 448.69793701171875, |
| "learning_rate": 2.861111111111111e-05, |
| "loss": 92.5643, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.16766202167662, |
| "grad_norm": 626.5888671875, |
| "learning_rate": 2.8333333333333335e-05, |
| "loss": 90.8, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.18978102189781, |
| "grad_norm": 935.4774780273438, |
| "learning_rate": 2.8055555555555557e-05, |
| "loss": 56.0015, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.2119000221190004, |
| "grad_norm": 578.9419555664062, |
| "learning_rate": 2.777777777777778e-05, |
| "loss": 63.4371, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.23401902234019, |
| "grad_norm": 434.9461364746094, |
| "learning_rate": 2.7500000000000004e-05, |
| "loss": 81.4173, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.2561380225613803, |
| "grad_norm": 823.6088256835938, |
| "learning_rate": 2.7222222222222223e-05, |
| "loss": 91.1232, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.2782570227825705, |
| "grad_norm": 12951.28515625, |
| "learning_rate": 2.6944444444444445e-05, |
| "loss": 83.2001, |
| "step": 10300 |
| }, |
| { |
| "epoch": 2.30037602300376, |
| "grad_norm": 466.7561950683594, |
| "learning_rate": 2.6666666666666667e-05, |
| "loss": 105.8661, |
| "step": 10400 |
| }, |
| { |
| "epoch": 2.3224950232249504, |
| "grad_norm": 439.0936584472656, |
| "learning_rate": 2.6388888888888892e-05, |
| "loss": 131.327, |
| "step": 10500 |
| }, |
| { |
| "epoch": 2.34461402344614, |
| "grad_norm": 822.1676635742188, |
| "learning_rate": 2.6111111111111114e-05, |
| "loss": 93.9335, |
| "step": 10600 |
| }, |
| { |
| "epoch": 2.3667330236673303, |
| "grad_norm": 1097.796875, |
| "learning_rate": 2.5833333333333336e-05, |
| "loss": 114.3786, |
| "step": 10700 |
| }, |
| { |
| "epoch": 2.3888520238885205, |
| "grad_norm": 513.4930419921875, |
| "learning_rate": 2.5555555555555554e-05, |
| "loss": 117.5649, |
| "step": 10800 |
| }, |
| { |
| "epoch": 2.41097102410971, |
| "grad_norm": 488.8702087402344, |
| "learning_rate": 2.527777777777778e-05, |
| "loss": 93.1184, |
| "step": 10900 |
| }, |
| { |
| "epoch": 2.4330900243309004, |
| "grad_norm": 16564.607421875, |
| "learning_rate": 2.5e-05, |
| "loss": 101.7624, |
| "step": 11000 |
| }, |
| { |
| "epoch": 2.45520902455209, |
| "grad_norm": 905.677978515625, |
| "learning_rate": 2.4722222222222223e-05, |
| "loss": 110.5879, |
| "step": 11100 |
| }, |
| { |
| "epoch": 2.4773280247732803, |
| "grad_norm": 289.8318786621094, |
| "learning_rate": 2.4444444444444445e-05, |
| "loss": 61.5242, |
| "step": 11200 |
| }, |
| { |
| "epoch": 2.4994470249944705, |
| "grad_norm": 457.4600524902344, |
| "learning_rate": 2.4166666666666667e-05, |
| "loss": 92.5947, |
| "step": 11300 |
| }, |
| { |
| "epoch": 2.52156602521566, |
| "grad_norm": 1546.554931640625, |
| "learning_rate": 2.3888888888888892e-05, |
| "loss": 131.2207, |
| "step": 11400 |
| }, |
| { |
| "epoch": 2.5436850254368504, |
| "grad_norm": 452.04412841796875, |
| "learning_rate": 2.361111111111111e-05, |
| "loss": 66.9818, |
| "step": 11500 |
| }, |
| { |
| "epoch": 2.56580402565804, |
| "grad_norm": 7368.85595703125, |
| "learning_rate": 2.3333333333333336e-05, |
| "loss": 128.0609, |
| "step": 11600 |
| }, |
| { |
| "epoch": 2.5879230258792303, |
| "grad_norm": 833.9178466796875, |
| "learning_rate": 2.3055555555555558e-05, |
| "loss": 82.7991, |
| "step": 11700 |
| }, |
| { |
| "epoch": 2.6100420261004205, |
| "grad_norm": 488.8749084472656, |
| "learning_rate": 2.277777777777778e-05, |
| "loss": 70.6396, |
| "step": 11800 |
| }, |
| { |
| "epoch": 2.63216102632161, |
| "grad_norm": 525.4971923828125, |
| "learning_rate": 2.25e-05, |
| "loss": 81.1729, |
| "step": 11900 |
| }, |
| { |
| "epoch": 2.6542800265428004, |
| "grad_norm": 506.8924865722656, |
| "learning_rate": 2.2222222222222223e-05, |
| "loss": 70.9919, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.67639902676399, |
| "grad_norm": 272.2956848144531, |
| "learning_rate": 2.1944444444444445e-05, |
| "loss": 83.741, |
| "step": 12100 |
| }, |
| { |
| "epoch": 2.6985180269851803, |
| "grad_norm": 153.6388702392578, |
| "learning_rate": 2.1666666666666667e-05, |
| "loss": 82.1675, |
| "step": 12200 |
| }, |
| { |
| "epoch": 2.7206370272063705, |
| "grad_norm": 668.1585693359375, |
| "learning_rate": 2.138888888888889e-05, |
| "loss": 93.7683, |
| "step": 12300 |
| }, |
| { |
| "epoch": 2.74275602742756, |
| "grad_norm": 386.3387756347656, |
| "learning_rate": 2.111111111111111e-05, |
| "loss": 69.9565, |
| "step": 12400 |
| }, |
| { |
| "epoch": 2.7648750276487504, |
| "grad_norm": 456.8686828613281, |
| "learning_rate": 2.0833333333333336e-05, |
| "loss": 86.7724, |
| "step": 12500 |
| }, |
| { |
| "epoch": 2.78699402786994, |
| "grad_norm": 300.98541259765625, |
| "learning_rate": 2.0555555555555555e-05, |
| "loss": 59.863, |
| "step": 12600 |
| }, |
| { |
| "epoch": 2.8091130280911303, |
| "grad_norm": 434.6181945800781, |
| "learning_rate": 2.027777777777778e-05, |
| "loss": 92.5719, |
| "step": 12700 |
| }, |
| { |
| "epoch": 2.8312320283123205, |
| "grad_norm": 1306.5218505859375, |
| "learning_rate": 2e-05, |
| "loss": 72.4254, |
| "step": 12800 |
| }, |
| { |
| "epoch": 2.85335102853351, |
| "grad_norm": 255.54293823242188, |
| "learning_rate": 1.9722222222222224e-05, |
| "loss": 55.263, |
| "step": 12900 |
| }, |
| { |
| "epoch": 2.8754700287547004, |
| "grad_norm": 1816.9591064453125, |
| "learning_rate": 1.9444444444444445e-05, |
| "loss": 72.7496, |
| "step": 13000 |
| }, |
| { |
| "epoch": 2.89758902897589, |
| "grad_norm": 831.1156616210938, |
| "learning_rate": 1.9166666666666667e-05, |
| "loss": 92.1811, |
| "step": 13100 |
| }, |
| { |
| "epoch": 2.9197080291970803, |
| "grad_norm": 2121.381591796875, |
| "learning_rate": 1.888888888888889e-05, |
| "loss": 84.3183, |
| "step": 13200 |
| }, |
| { |
| "epoch": 2.9418270294182705, |
| "grad_norm": 348.0662841796875, |
| "learning_rate": 1.861111111111111e-05, |
| "loss": 79.233, |
| "step": 13300 |
| }, |
| { |
| "epoch": 2.96394602963946, |
| "grad_norm": 502.2306823730469, |
| "learning_rate": 1.8333333333333333e-05, |
| "loss": 86.6864, |
| "step": 13400 |
| }, |
| { |
| "epoch": 2.9860650298606504, |
| "grad_norm": 498.3459777832031, |
| "learning_rate": 1.8055555555555555e-05, |
| "loss": 67.7695, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 190.48678588867188, |
| "eval_runtime": 13.6446, |
| "eval_samples_per_second": 147.311, |
| "eval_steps_per_second": 36.864, |
| "step": 13563 |
| }, |
| { |
| "epoch": 3.00818403008184, |
| "grad_norm": 691.843505859375, |
| "learning_rate": 1.777777777777778e-05, |
| "loss": 85.3921, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.0303030303030303, |
| "grad_norm": 1307.369140625, |
| "learning_rate": 1.75e-05, |
| "loss": 68.4007, |
| "step": 13700 |
| }, |
| { |
| "epoch": 3.0524220305242205, |
| "grad_norm": 329.44451904296875, |
| "learning_rate": 1.7222222222222224e-05, |
| "loss": 74.87, |
| "step": 13800 |
| }, |
| { |
| "epoch": 3.07454103074541, |
| "grad_norm": 2009.1265869140625, |
| "learning_rate": 1.6944444444444446e-05, |
| "loss": 78.6698, |
| "step": 13900 |
| }, |
| { |
| "epoch": 3.0966600309666004, |
| "grad_norm": 893.1031494140625, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 72.2353, |
| "step": 14000 |
| }, |
| { |
| "epoch": 3.11877903118779, |
| "grad_norm": 455.1803283691406, |
| "learning_rate": 1.638888888888889e-05, |
| "loss": 54.7878, |
| "step": 14100 |
| }, |
| { |
| "epoch": 3.1408980314089803, |
| "grad_norm": 494.5647888183594, |
| "learning_rate": 1.6111111111111115e-05, |
| "loss": 70.0231, |
| "step": 14200 |
| }, |
| { |
| "epoch": 3.1630170316301705, |
| "grad_norm": 356.6091613769531, |
| "learning_rate": 1.5833333333333333e-05, |
| "loss": 77.7633, |
| "step": 14300 |
| }, |
| { |
| "epoch": 3.18513603185136, |
| "grad_norm": 1459.70458984375, |
| "learning_rate": 1.5555555555555555e-05, |
| "loss": 85.1146, |
| "step": 14400 |
| }, |
| { |
| "epoch": 3.2072550320725504, |
| "grad_norm": 1403.0411376953125, |
| "learning_rate": 1.527777777777778e-05, |
| "loss": 81.8396, |
| "step": 14500 |
| }, |
| { |
| "epoch": 3.22937403229374, |
| "grad_norm": 618.7579345703125, |
| "learning_rate": 1.5e-05, |
| "loss": 60.2011, |
| "step": 14600 |
| }, |
| { |
| "epoch": 3.2514930325149303, |
| "grad_norm": 283.14459228515625, |
| "learning_rate": 1.4722222222222224e-05, |
| "loss": 51.0029, |
| "step": 14700 |
| }, |
| { |
| "epoch": 3.2736120327361204, |
| "grad_norm": 217.41241455078125, |
| "learning_rate": 1.4444444444444444e-05, |
| "loss": 74.5451, |
| "step": 14800 |
| }, |
| { |
| "epoch": 3.29573103295731, |
| "grad_norm": 205.61912536621094, |
| "learning_rate": 1.4166666666666668e-05, |
| "loss": 89.2503, |
| "step": 14900 |
| }, |
| { |
| "epoch": 3.3178500331785004, |
| "grad_norm": 1901.0181884765625, |
| "learning_rate": 1.388888888888889e-05, |
| "loss": 60.1773, |
| "step": 15000 |
| }, |
| { |
| "epoch": 3.33996903339969, |
| "grad_norm": 3059.44677734375, |
| "learning_rate": 1.3611111111111111e-05, |
| "loss": 74.0471, |
| "step": 15100 |
| }, |
| { |
| "epoch": 3.3620880336208803, |
| "grad_norm": 742.3741455078125, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 70.3482, |
| "step": 15200 |
| }, |
| { |
| "epoch": 3.3842070338420704, |
| "grad_norm": 1228.872802734375, |
| "learning_rate": 1.3055555555555557e-05, |
| "loss": 70.8862, |
| "step": 15300 |
| }, |
| { |
| "epoch": 3.40632603406326, |
| "grad_norm": 255.646728515625, |
| "learning_rate": 1.2777777777777777e-05, |
| "loss": 91.8017, |
| "step": 15400 |
| }, |
| { |
| "epoch": 3.4284450342844504, |
| "grad_norm": 112406.4921875, |
| "learning_rate": 1.25e-05, |
| "loss": 134.3668, |
| "step": 15500 |
| }, |
| { |
| "epoch": 3.4505640345056405, |
| "grad_norm": 3105.224609375, |
| "learning_rate": 1.2222222222222222e-05, |
| "loss": 65.372, |
| "step": 15600 |
| }, |
| { |
| "epoch": 3.4726830347268303, |
| "grad_norm": 357.5060119628906, |
| "learning_rate": 1.1944444444444446e-05, |
| "loss": 76.7241, |
| "step": 15700 |
| }, |
| { |
| "epoch": 3.4948020349480204, |
| "grad_norm": 347.73797607421875, |
| "learning_rate": 1.1666666666666668e-05, |
| "loss": 66.5545, |
| "step": 15800 |
| }, |
| { |
| "epoch": 3.5169210351692106, |
| "grad_norm": 747.213134765625, |
| "learning_rate": 1.138888888888889e-05, |
| "loss": 85.4001, |
| "step": 15900 |
| }, |
| { |
| "epoch": 3.5390400353904004, |
| "grad_norm": 637.4097290039062, |
| "learning_rate": 1.1111111111111112e-05, |
| "loss": 67.5741, |
| "step": 16000 |
| }, |
| { |
| "epoch": 3.56115903561159, |
| "grad_norm": 539.6690673828125, |
| "learning_rate": 1.0833333333333334e-05, |
| "loss": 58.0714, |
| "step": 16100 |
| }, |
| { |
| "epoch": 3.5832780358327803, |
| "grad_norm": 164.90731811523438, |
| "learning_rate": 1.0555555555555555e-05, |
| "loss": 71.1992, |
| "step": 16200 |
| }, |
| { |
| "epoch": 3.6053970360539704, |
| "grad_norm": 1147.67822265625, |
| "learning_rate": 1.0277777777777777e-05, |
| "loss": 69.1552, |
| "step": 16300 |
| }, |
| { |
| "epoch": 3.6275160362751606, |
| "grad_norm": 1046.4427490234375, |
| "learning_rate": 1e-05, |
| "loss": 76.3746, |
| "step": 16400 |
| }, |
| { |
| "epoch": 3.6496350364963503, |
| "grad_norm": 694.0726318359375, |
| "learning_rate": 9.722222222222223e-06, |
| "loss": 72.2259, |
| "step": 16500 |
| }, |
| { |
| "epoch": 3.6717540367175405, |
| "grad_norm": 791.6326293945312, |
| "learning_rate": 9.444444444444445e-06, |
| "loss": 61.315, |
| "step": 16600 |
| }, |
| { |
| "epoch": 3.6938730369387303, |
| "grad_norm": 2267.64501953125, |
| "learning_rate": 9.166666666666666e-06, |
| "loss": 70.8261, |
| "step": 16700 |
| }, |
| { |
| "epoch": 3.7159920371599204, |
| "grad_norm": 420.0555114746094, |
| "learning_rate": 8.88888888888889e-06, |
| "loss": 82.5815, |
| "step": 16800 |
| }, |
| { |
| "epoch": 3.7381110373811106, |
| "grad_norm": 846.6657104492188, |
| "learning_rate": 8.611111111111112e-06, |
| "loss": 76.4155, |
| "step": 16900 |
| }, |
| { |
| "epoch": 3.7602300376023003, |
| "grad_norm": 518.072998046875, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 77.4135, |
| "step": 17000 |
| }, |
| { |
| "epoch": 3.7823490378234905, |
| "grad_norm": 1067.0804443359375, |
| "learning_rate": 8.055555555555557e-06, |
| "loss": 78.471, |
| "step": 17100 |
| }, |
| { |
| "epoch": 3.8044680380446803, |
| "grad_norm": 1737.0146484375, |
| "learning_rate": 7.777777777777777e-06, |
| "loss": 57.732, |
| "step": 17200 |
| }, |
| { |
| "epoch": 3.8265870382658704, |
| "grad_norm": 377.42120361328125, |
| "learning_rate": 7.5e-06, |
| "loss": 51.0151, |
| "step": 17300 |
| }, |
| { |
| "epoch": 3.8487060384870606, |
| "grad_norm": 544.689697265625, |
| "learning_rate": 7.222222222222222e-06, |
| "loss": 73.9021, |
| "step": 17400 |
| }, |
| { |
| "epoch": 3.8708250387082503, |
| "grad_norm": 198.7846221923828, |
| "learning_rate": 6.944444444444445e-06, |
| "loss": 55.6872, |
| "step": 17500 |
| }, |
| { |
| "epoch": 3.8929440389294405, |
| "grad_norm": 169.698486328125, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 77.5166, |
| "step": 17600 |
| }, |
| { |
| "epoch": 3.9150630391506303, |
| "grad_norm": 671.6690673828125, |
| "learning_rate": 6.3888888888888885e-06, |
| "loss": 71.126, |
| "step": 17700 |
| }, |
| { |
| "epoch": 3.9371820393718204, |
| "grad_norm": 267.1748046875, |
| "learning_rate": 6.111111111111111e-06, |
| "loss": 68.3726, |
| "step": 17800 |
| }, |
| { |
| "epoch": 3.9593010395930106, |
| "grad_norm": 958.6556396484375, |
| "learning_rate": 5.833333333333334e-06, |
| "loss": 48.5082, |
| "step": 17900 |
| }, |
| { |
| "epoch": 3.9814200398142003, |
| "grad_norm": 981.7296142578125, |
| "learning_rate": 5.555555555555556e-06, |
| "loss": 75.4993, |
| "step": 18000 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 184.00929260253906, |
| "eval_runtime": 13.5718, |
| "eval_samples_per_second": 148.101, |
| "eval_steps_per_second": 37.062, |
| "step": 18084 |
| }, |
| { |
| "epoch": 4.00353904003539, |
| "grad_norm": 1069.6796875, |
| "learning_rate": 5.277777777777778e-06, |
| "loss": 68.5521, |
| "step": 18100 |
| }, |
| { |
| "epoch": 4.02565804025658, |
| "grad_norm": 358.9207763671875, |
| "learning_rate": 5e-06, |
| "loss": 61.5938, |
| "step": 18200 |
| }, |
| { |
| "epoch": 4.04777704047777, |
| "grad_norm": 1874.0775146484375, |
| "learning_rate": 4.722222222222222e-06, |
| "loss": 61.3293, |
| "step": 18300 |
| }, |
| { |
| "epoch": 4.069896040698961, |
| "grad_norm": 1064.0521240234375, |
| "learning_rate": 4.444444444444445e-06, |
| "loss": 61.2526, |
| "step": 18400 |
| }, |
| { |
| "epoch": 4.092015040920151, |
| "grad_norm": 530.5929565429688, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 70.5009, |
| "step": 18500 |
| }, |
| { |
| "epoch": 4.11413404114134, |
| "grad_norm": 630.8464965820312, |
| "learning_rate": 3.888888888888889e-06, |
| "loss": 73.1008, |
| "step": 18600 |
| }, |
| { |
| "epoch": 4.13625304136253, |
| "grad_norm": 806.4486083984375, |
| "learning_rate": 3.611111111111111e-06, |
| "loss": 61.8402, |
| "step": 18700 |
| }, |
| { |
| "epoch": 4.15837204158372, |
| "grad_norm": 267.43902587890625, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 50.5993, |
| "step": 18800 |
| }, |
| { |
| "epoch": 4.180491041804911, |
| "grad_norm": 289.7193298339844, |
| "learning_rate": 3.0555555555555556e-06, |
| "loss": 82.3094, |
| "step": 18900 |
| }, |
| { |
| "epoch": 4.202610042026101, |
| "grad_norm": 287.5179748535156, |
| "learning_rate": 2.777777777777778e-06, |
| "loss": 67.3702, |
| "step": 19000 |
| }, |
| { |
| "epoch": 4.22472904224729, |
| "grad_norm": 723.5756225585938, |
| "learning_rate": 2.5e-06, |
| "loss": 69.6824, |
| "step": 19100 |
| }, |
| { |
| "epoch": 4.24684804246848, |
| "grad_norm": 1228.8450927734375, |
| "learning_rate": 2.2222222222222225e-06, |
| "loss": 67.9151, |
| "step": 19200 |
| }, |
| { |
| "epoch": 4.26896704268967, |
| "grad_norm": 187.32969665527344, |
| "learning_rate": 1.9444444444444444e-06, |
| "loss": 63.4367, |
| "step": 19300 |
| }, |
| { |
| "epoch": 4.291086042910861, |
| "grad_norm": 354.1274108886719, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 74.4323, |
| "step": 19400 |
| }, |
| { |
| "epoch": 4.313205043132051, |
| "grad_norm": 1628.03857421875, |
| "learning_rate": 1.388888888888889e-06, |
| "loss": 74.6173, |
| "step": 19500 |
| }, |
| { |
| "epoch": 4.33532404335324, |
| "grad_norm": 286.3065185546875, |
| "learning_rate": 1.1111111111111112e-06, |
| "loss": 66.8454, |
| "step": 19600 |
| }, |
| { |
| "epoch": 4.35744304357443, |
| "grad_norm": 140.7686767578125, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 40.7083, |
| "step": 19700 |
| }, |
| { |
| "epoch": 4.37956204379562, |
| "grad_norm": 907.8124389648438, |
| "learning_rate": 5.555555555555556e-07, |
| "loss": 79.8728, |
| "step": 19800 |
| }, |
| { |
| "epoch": 4.401681044016811, |
| "grad_norm": 498.08941650390625, |
| "learning_rate": 2.777777777777778e-07, |
| "loss": 68.5423, |
| "step": 19900 |
| }, |
| { |
| "epoch": 4.423800044238001, |
| "grad_norm": 380.51446533203125, |
| "learning_rate": 0.0, |
| "loss": 58.604, |
| "step": 20000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 20000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|