| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.96, | |
| "eval_steps": 500, | |
| "global_step": 2400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0004, | |
| "grad_norm": 8.119571685791016, | |
| "learning_rate": 0.0, | |
| "loss": 3.4677, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": 8.098031997680664, | |
| "learning_rate": 9e-06, | |
| "loss": 4.4682, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 8.463135719299316, | |
| "learning_rate": 1.9e-05, | |
| "loss": 4.6132, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 7.452148914337158, | |
| "learning_rate": 2.9e-05, | |
| "loss": 4.223, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 5.572236061096191, | |
| "learning_rate": 3.9000000000000006e-05, | |
| "loss": 4.4111, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 7.39447021484375, | |
| "learning_rate": 4.9e-05, | |
| "loss": 4.0543, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 9.954078674316406, | |
| "learning_rate": 4.999833521640187e-05, | |
| "loss": 4.5007, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 5.994736194610596, | |
| "learning_rate": 4.9992580693557054e-05, | |
| "loss": 4.6204, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 7.18951940536499, | |
| "learning_rate": 4.998271682453017e-05, | |
| "loss": 4.2467, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 9.591761589050293, | |
| "learning_rate": 4.996874523116464e-05, | |
| "loss": 4.4063, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 7.566534519195557, | |
| "learning_rate": 4.995066821070679e-05, | |
| "loss": 4.0773, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 5.793056488037109, | |
| "learning_rate": 4.9928488735428105e-05, | |
| "loss": 4.063, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 7.917051792144775, | |
| "learning_rate": 4.990221045213652e-05, | |
| "loss": 4.2533, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 7.804363250732422, | |
| "learning_rate": 4.987183768157686e-05, | |
| "loss": 4.0497, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 7.605108261108398, | |
| "learning_rate": 4.983737541772033e-05, | |
| "loss": 4.4334, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 9.323838233947754, | |
| "learning_rate": 4.979882932694346e-05, | |
| "loss": 4.0412, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 9.219818115234375, | |
| "learning_rate": 4.9756205747096385e-05, | |
| "loss": 3.9774, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "grad_norm": 8.375937461853027, | |
| "learning_rate": 4.9709511686460775e-05, | |
| "loss": 4.0021, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 8.215436935424805, | |
| "learning_rate": 4.96587548225975e-05, | |
| "loss": 4.4227, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "grad_norm": 9.561295509338379, | |
| "learning_rate": 4.960394350108429e-05, | |
| "loss": 4.1091, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 9.014528274536133, | |
| "learning_rate": 4.954508673414351e-05, | |
| "loss": 3.9428, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "grad_norm": 7.69431209564209, | |
| "learning_rate": 4.948219419916037e-05, | |
| "loss": 4.368, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 11.925583839416504, | |
| "learning_rate": 4.941527623709172e-05, | |
| "loss": 3.6757, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "grad_norm": 8.191117286682129, | |
| "learning_rate": 4.934434385076576e-05, | |
| "loss": 4.1905, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 7.863613128662109, | |
| "learning_rate": 4.926940870307296e-05, | |
| "loss": 4.0099, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 10.665002822875977, | |
| "learning_rate": 4.9190483115048375e-05, | |
| "loss": 3.9059, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 10.385906219482422, | |
| "learning_rate": 4.910758006384583e-05, | |
| "loss": 3.9221, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "grad_norm": 8.544922828674316, | |
| "learning_rate": 4.9020713180604126e-05, | |
| "loss": 3.9398, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 7.989080429077148, | |
| "learning_rate": 4.892989674820585e-05, | |
| "loss": 3.7757, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "grad_norm": 6.576107025146484, | |
| "learning_rate": 4.8835145698928856e-05, | |
| "loss": 3.5309, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 9.80089282989502, | |
| "learning_rate": 4.873647561199115e-05, | |
| "loss": 4.1776, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "grad_norm": 15.050427436828613, | |
| "learning_rate": 4.863390271098922e-05, | |
| "loss": 3.5808, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 8.734102249145508, | |
| "learning_rate": 4.852744386123061e-05, | |
| "loss": 3.9796, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.132, | |
| "grad_norm": 8.711186408996582, | |
| "learning_rate": 4.84171165669608e-05, | |
| "loss": 4.2317, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 6.751059055328369, | |
| "learning_rate": 4.8302938968485144e-05, | |
| "loss": 3.7145, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 10.623860359191895, | |
| "learning_rate": 4.8184929839186196e-05, | |
| "loss": 3.9616, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 7.5071330070495605, | |
| "learning_rate": 4.806310858243694e-05, | |
| "loss": 4.0164, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.148, | |
| "grad_norm": 8.607765197753906, | |
| "learning_rate": 4.793749522841042e-05, | |
| "loss": 4.4924, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 8.406026840209961, | |
| "learning_rate": 4.780811043078636e-05, | |
| "loss": 3.4254, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.156, | |
| "grad_norm": 9.387131690979004, | |
| "learning_rate": 4.767497546335519e-05, | |
| "loss": 3.9158, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 7.5071258544921875, | |
| "learning_rate": 4.753811221652017e-05, | |
| "loss": 4.1042, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.164, | |
| "grad_norm": 6.716228008270264, | |
| "learning_rate": 4.739754319369814e-05, | |
| "loss": 3.8632, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 9.47385311126709, | |
| "learning_rate": 4.7253291507619404e-05, | |
| "loss": 3.7837, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.172, | |
| "grad_norm": 7.547070026397705, | |
| "learning_rate": 4.710538087652748e-05, | |
| "loss": 4.0398, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 13.61339282989502, | |
| "learning_rate": 4.695383562027933e-05, | |
| "loss": 3.7789, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.434921741485596, | |
| "learning_rate": 4.679868065634656e-05, | |
| "loss": 3.9506, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 7.865591049194336, | |
| "learning_rate": 4.663994149571849e-05, | |
| "loss": 3.4036, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.188, | |
| "grad_norm": 9.311790466308594, | |
| "learning_rate": 4.647764423870751e-05, | |
| "loss": 4.1299, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 8.277907371520996, | |
| "learning_rate": 4.631181557065761e-05, | |
| "loss": 4.0614, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.196, | |
| "grad_norm": 12.398967742919922, | |
| "learning_rate": 4.614248275755676e-05, | |
| "loss": 3.7492, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 7.308017730712891, | |
| "learning_rate": 4.5969673641553685e-05, | |
| "loss": 4.1606, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.204, | |
| "grad_norm": 6.510436058044434, | |
| "learning_rate": 4.579341663638004e-05, | |
| "loss": 3.5708, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 10.914970397949219, | |
| "learning_rate": 4.5613740722678525e-05, | |
| "loss": 3.4741, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.212, | |
| "grad_norm": 8.786978721618652, | |
| "learning_rate": 4.5430675443237817e-05, | |
| "loss": 3.6204, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 10.566540718078613, | |
| "learning_rate": 4.524425089813507e-05, | |
| "loss": 3.9298, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 8.051084518432617, | |
| "learning_rate": 4.505449773978677e-05, | |
| "loss": 3.7783, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 11.182727813720703, | |
| "learning_rate": 4.4861447167908824e-05, | |
| "loss": 3.8174, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.228, | |
| "grad_norm": 11.375614166259766, | |
| "learning_rate": 4.466513092438653e-05, | |
| "loss": 4.0511, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 8.66441535949707, | |
| "learning_rate": 4.446558128805561e-05, | |
| "loss": 3.7058, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.236, | |
| "grad_norm": 5.743879795074463, | |
| "learning_rate": 4.426283106939474e-05, | |
| "loss": 3.817, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 10.231585502624512, | |
| "learning_rate": 4.4056913605130804e-05, | |
| "loss": 3.9779, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.244, | |
| "grad_norm": 5.9834465980529785, | |
| "learning_rate": 4.3847862752757604e-05, | |
| "loss": 3.4466, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 9.162353515625, | |
| "learning_rate": 4.363571288496888e-05, | |
| "loss": 3.576, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.252, | |
| "grad_norm": 10.162070274353027, | |
| "learning_rate": 4.342049888400669e-05, | |
| "loss": 4.084, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 12.520784378051758, | |
| "learning_rate": 4.3202256135925956e-05, | |
| "loss": 3.6774, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 10.249221801757812, | |
| "learning_rate": 4.298102052477621e-05, | |
| "loss": 3.9724, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 10.406034469604492, | |
| "learning_rate": 4.2756828426701426e-05, | |
| "loss": 3.9906, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.268, | |
| "grad_norm": 15.699187278747559, | |
| "learning_rate": 4.2529716703959024e-05, | |
| "loss": 3.2696, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 10.240876197814941, | |
| "learning_rate": 4.229972269885877e-05, | |
| "loss": 3.2456, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.276, | |
| "grad_norm": 11.525603294372559, | |
| "learning_rate": 4.206688422762295e-05, | |
| "loss": 3.6349, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.479814052581787, | |
| "learning_rate": 4.1831239574168493e-05, | |
| "loss": 3.5024, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.284, | |
| "grad_norm": 10.296248435974121, | |
| "learning_rate": 4.159282748381218e-05, | |
| "loss": 4.0567, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 13.293269157409668, | |
| "learning_rate": 4.135168715690015e-05, | |
| "loss": 3.9591, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.292, | |
| "grad_norm": 7.214468479156494, | |
| "learning_rate": 4.110785824236236e-05, | |
| "loss": 3.8723, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 8.655447006225586, | |
| "learning_rate": 4.086138083119347e-05, | |
| "loss": 3.7503, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 12.488017082214355, | |
| "learning_rate": 4.061229544986095e-05, | |
| "loss": 3.6059, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 10.392841339111328, | |
| "learning_rate": 4.036064305364162e-05, | |
| "loss": 3.7607, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.308, | |
| "grad_norm": 13.796865463256836, | |
| "learning_rate": 4.010646501988769e-05, | |
| "loss": 3.3188, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 6.374794006347656, | |
| "learning_rate": 3.9849803141223324e-05, | |
| "loss": 3.3962, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.316, | |
| "grad_norm": 9.044532775878906, | |
| "learning_rate": 3.9590699618673086e-05, | |
| "loss": 3.9154, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 8.740546226501465, | |
| "learning_rate": 3.932919705472306e-05, | |
| "loss": 3.4457, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.324, | |
| "grad_norm": 10.944662094116211, | |
| "learning_rate": 3.906533844631604e-05, | |
| "loss": 3.6514, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 8.953042984008789, | |
| "learning_rate": 3.879916717778191e-05, | |
| "loss": 3.705, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.332, | |
| "grad_norm": 10.540362358093262, | |
| "learning_rate": 3.8530727013704215e-05, | |
| "loss": 3.4666, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 8.946858406066895, | |
| "learning_rate": 3.826006209172433e-05, | |
| "loss": 3.8688, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 8.028446197509766, | |
| "learning_rate": 3.7987216915284184e-05, | |
| "loss": 3.6068, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 13.014655113220215, | |
| "learning_rate": 3.771223634630892e-05, | |
| "loss": 3.883, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.348, | |
| "grad_norm": 7.317591667175293, | |
| "learning_rate": 3.743516559783055e-05, | |
| "loss": 3.8452, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 9.003655433654785, | |
| "learning_rate": 3.7156050226553956e-05, | |
| "loss": 3.5083, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.356, | |
| "grad_norm": 8.790939331054688, | |
| "learning_rate": 3.687493612536628e-05, | |
| "loss": 3.6303, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 9.35024642944336, | |
| "learning_rate": 3.659186951579111e-05, | |
| "loss": 3.3183, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.364, | |
| "grad_norm": 12.418292045593262, | |
| "learning_rate": 3.630689694038866e-05, | |
| "loss": 3.6162, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 9.97085952758789, | |
| "learning_rate": 3.6020065255103056e-05, | |
| "loss": 3.6587, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.372, | |
| "grad_norm": 11.682862281799316, | |
| "learning_rate": 3.573142162155819e-05, | |
| "loss": 3.679, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 10.91349983215332, | |
| "learning_rate": 3.544101349930328e-05, | |
| "loss": 3.5703, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 7.593992710113525, | |
| "learning_rate": 3.514888863800944e-05, | |
| "loss": 3.0866, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 7.078611850738525, | |
| "learning_rate": 3.485509506961856e-05, | |
| "loss": 3.5236, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.388, | |
| "grad_norm": 4.66752290725708, | |
| "learning_rate": 3.4559681100445756e-05, | |
| "loss": 3.0979, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 11.089188575744629, | |
| "learning_rate": 3.4262695303236724e-05, | |
| "loss": 3.5252, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.396, | |
| "grad_norm": 9.009184837341309, | |
| "learning_rate": 3.396418650918127e-05, | |
| "loss": 3.7062, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 7.165460109710693, | |
| "learning_rate": 3.366420379988441e-05, | |
| "loss": 3.4182, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.404, | |
| "grad_norm": 13.719085693359375, | |
| "learning_rate": 3.336279649929614e-05, | |
| "loss": 3.6603, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 10.594961166381836, | |
| "learning_rate": 3.306001416560156e-05, | |
| "loss": 3.824, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.412, | |
| "grad_norm": 9.565075874328613, | |
| "learning_rate": 3.275590658307234e-05, | |
| "loss": 3.074, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 11.031000137329102, | |
| "learning_rate": 3.245052375388107e-05, | |
| "loss": 3.3561, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 8.683501243591309, | |
| "learning_rate": 3.214391588987976e-05, | |
| "loss": 3.4976, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 7.569673538208008, | |
| "learning_rate": 3.1836133404343885e-05, | |
| "loss": 3.3982, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.428, | |
| "grad_norm": 9.724939346313477, | |
| "learning_rate": 3.1527226903683286e-05, | |
| "loss": 3.1605, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 11.795547485351562, | |
| "learning_rate": 3.121724717912138e-05, | |
| "loss": 3.4858, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.436, | |
| "grad_norm": 10.01028823852539, | |
| "learning_rate": 3.090624519834383e-05, | |
| "loss": 3.5917, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 10.159195899963379, | |
| "learning_rate": 3.0594272097118436e-05, | |
| "loss": 3.5127, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.444, | |
| "grad_norm": 12.02109432220459, | |
| "learning_rate": 3.028137917088716e-05, | |
| "loss": 3.7095, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 11.922860145568848, | |
| "learning_rate": 2.9967617866331997e-05, | |
| "loss": 3.0155, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.452, | |
| "grad_norm": 7.406614780426025, | |
| "learning_rate": 2.9653039772916052e-05, | |
| "loss": 3.5601, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 9.041807174682617, | |
| "learning_rate": 2.9337696614400977e-05, | |
| "loss": 3.4362, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 7.662649631500244, | |
| "learning_rate": 2.902164024034246e-05, | |
| "loss": 3.2583, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 10.045381546020508, | |
| "learning_rate": 2.8704922617564983e-05, | |
| "loss": 3.5937, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.468, | |
| "grad_norm": 10.429932594299316, | |
| "learning_rate": 2.8387595821617275e-05, | |
| "loss": 3.5703, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 13.951080322265625, | |
| "learning_rate": 2.8069712028209927e-05, | |
| "loss": 3.2037, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.476, | |
| "grad_norm": 9.768102645874023, | |
| "learning_rate": 2.7751323504636544e-05, | |
| "loss": 3.2948, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 16.445524215698242, | |
| "learning_rate": 2.7432482601179794e-05, | |
| "loss": 3.7049, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.484, | |
| "grad_norm": 10.077542304992676, | |
| "learning_rate": 2.711324174250382e-05, | |
| "loss": 3.7272, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 10.981230735778809, | |
| "learning_rate": 2.6793653419034482e-05, | |
| "loss": 3.3686, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.492, | |
| "grad_norm": 8.846978187561035, | |
| "learning_rate": 2.6473770178328715e-05, | |
| "loss": 3.7523, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 13.945764541625977, | |
| "learning_rate": 2.6153644616434526e-05, | |
| "loss": 3.5152, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 10.375041961669922, | |
| "learning_rate": 2.583332936924299e-05, | |
| "loss": 3.4198, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 7.862137794494629, | |
| "learning_rate": 2.5512877103833783e-05, | |
| "loss": 3.4253, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.508, | |
| "grad_norm": 9.651905059814453, | |
| "learning_rate": 2.519234050981543e-05, | |
| "loss": 2.9916, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 8.323561668395996, | |
| "learning_rate": 2.4871772290662044e-05, | |
| "loss": 3.0336, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.516, | |
| "grad_norm": 7.276916980743408, | |
| "learning_rate": 2.4551225155047573e-05, | |
| "loss": 3.3251, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 9.36464786529541, | |
| "learning_rate": 2.423075180817938e-05, | |
| "loss": 3.0858, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.524, | |
| "grad_norm": 9.869660377502441, | |
| "learning_rate": 2.391040494313229e-05, | |
| "loss": 3.2847, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 8.658061981201172, | |
| "learning_rate": 2.3590237232184644e-05, | |
| "loss": 3.1331, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.532, | |
| "grad_norm": 8.946754455566406, | |
| "learning_rate": 2.3270301318157792e-05, | |
| "loss": 3.4923, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 10.488960266113281, | |
| "learning_rate": 2.2950649805760438e-05, | |
| "loss": 3.2958, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 12.32264518737793, | |
| "learning_rate": 2.263133525293918e-05, | |
| "loss": 2.9298, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 14.110706329345703, | |
| "learning_rate": 2.2312410162236883e-05, | |
| "loss": 3.2753, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.548, | |
| "grad_norm": 11.187686920166016, | |
| "learning_rate": 2.1993926972159972e-05, | |
| "loss": 3.4152, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 10.895075798034668, | |
| "learning_rate": 2.1675938048556446e-05, | |
| "loss": 3.4019, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.556, | |
| "grad_norm": 5.504537105560303, | |
| "learning_rate": 2.1358495676005664e-05, | |
| "loss": 3.167, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 8.452468872070312, | |
| "learning_rate": 2.1041652049221648e-05, | |
| "loss": 3.0729, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.564, | |
| "grad_norm": 11.04509449005127, | |
| "learning_rate": 2.0725459264471047e-05, | |
| "loss": 3.642, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 8.009263038635254, | |
| "learning_rate": 2.0409969311007335e-05, | |
| "loss": 3.1349, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.572, | |
| "grad_norm": 8.250015258789062, | |
| "learning_rate": 2.009523406252263e-05, | |
| "loss": 3.4037, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 6.933814525604248, | |
| "learning_rate": 1.9781305268618417e-05, | |
| "loss": 3.2761, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 8.798672676086426, | |
| "learning_rate": 1.9468234546296844e-05, | |
| "loss": 3.2963, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 8.615999221801758, | |
| "learning_rate": 1.9156073371473618e-05, | |
| "loss": 3.3487, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.588, | |
| "grad_norm": 6.798926830291748, | |
| "learning_rate": 1.8844873070514272e-05, | |
| "loss": 3.2746, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 8.364091873168945, | |
| "learning_rate": 1.8534684811794893e-05, | |
| "loss": 3.071, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.596, | |
| "grad_norm": 6.177745342254639, | |
| "learning_rate": 1.822555959728892e-05, | |
| "loss": 2.8733, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 7.9252238273620605, | |
| "learning_rate": 1.7917548254181273e-05, | |
| "loss": 3.0836, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.604, | |
| "grad_norm": 10.669748306274414, | |
| "learning_rate": 1.7610701426511128e-05, | |
| "loss": 3.587, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 6.509505271911621, | |
| "learning_rate": 1.7305069566845046e-05, | |
| "loss": 2.8579, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.612, | |
| "grad_norm": 8.801206588745117, | |
| "learning_rate": 1.7000702927981254e-05, | |
| "loss": 3.5055, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 13.360625267028809, | |
| "learning_rate": 1.669765155468708e-05, | |
| "loss": 3.007, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 9.038350105285645, | |
| "learning_rate": 1.6395965275470393e-05, | |
| "loss": 3.546, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 12.468111038208008, | |
| "learning_rate": 1.6095693694386697e-05, | |
| "loss": 3.046, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.628, | |
| "grad_norm": 9.378480911254883, | |
| "learning_rate": 1.5796886182883053e-05, | |
| "loss": 2.9804, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 8.186980247497559, | |
| "learning_rate": 1.549959187168038e-05, | |
| "loss": 3.1672, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.636, | |
| "grad_norm": 13.096222877502441, | |
| "learning_rate": 1.520385964269519e-05, | |
| "loss": 3.0177, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 9.109463691711426, | |
| "learning_rate": 1.4909738121002276e-05, | |
| "loss": 3.101, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.644, | |
| "grad_norm": 8.423794746398926, | |
| "learning_rate": 1.4617275666839725e-05, | |
| "loss": 2.8508, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 6.3356122970581055, | |
| "learning_rate": 1.4326520367657314e-05, | |
| "loss": 3.3239, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.652, | |
| "grad_norm": 8.81240177154541, | |
| "learning_rate": 1.4037520030209934e-05, | |
| "loss": 3.2261, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 7.250948905944824, | |
| "learning_rate": 1.3750322172696972e-05, | |
| "loss": 3.1138, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 9.962249755859375, | |
| "learning_rate": 1.3464974016949342e-05, | |
| "loss": 3.2969, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 7.954286575317383, | |
| "learning_rate": 1.3181522480665098e-05, | |
| "loss": 2.7313, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.668, | |
| "grad_norm": 11.184345245361328, | |
| "learning_rate": 1.2900014169695082e-05, | |
| "loss": 3.2666, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 8.145426750183105, | |
| "learning_rate": 1.262049537037992e-05, | |
| "loss": 2.7759, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.676, | |
| "grad_norm": 12.047683715820312, | |
| "learning_rate": 1.2343012041939469e-05, | |
| "loss": 2.9462, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 11.436731338500977, | |
| "learning_rate": 1.2067609808916086e-05, | |
| "loss": 3.362, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.684, | |
| "grad_norm": 7.729074954986572, | |
| "learning_rate": 1.1794333953672893e-05, | |
| "loss": 3.4444, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 11.743609428405762, | |
| "learning_rate": 1.1523229408948394e-05, | |
| "loss": 2.9723, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.692, | |
| "grad_norm": 7.753131866455078, | |
| "learning_rate": 1.1254340750468445e-05, | |
| "loss": 3.0701, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 10.646190643310547, | |
| "learning_rate": 1.0987712189617049e-05, | |
| "loss": 3.3374, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 8.74120044708252, | |
| "learning_rate": 1.0723387566166979e-05, | |
| "loss": 3.0917, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 9.45445728302002, | |
| "learning_rate": 1.0461410341071528e-05, | |
| "loss": 3.2809, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.708, | |
| "grad_norm": 11.984269142150879, | |
| "learning_rate": 1.0201823589318554e-05, | |
| "loss": 3.256, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 6.780118465423584, | |
| "learning_rate": 9.944669992847946e-06, | |
| "loss": 3.0955, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.716, | |
| "grad_norm": 10.487933158874512, | |
| "learning_rate": 9.689991833533804e-06, | |
| "loss": 3.1214, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 7.70168399810791, | |
| "learning_rate": 9.437830986232265e-06, | |
| "loss": 3.052, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.724, | |
| "grad_norm": 13.816009521484375, | |
| "learning_rate": 9.188228911896412e-06, | |
| "loss": 3.4094, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 8.344259262084961, | |
| "learning_rate": 8.94122665075909e-06, | |
| "loss": 3.0472, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.732, | |
| "grad_norm": 19.413257598876953, | |
| "learning_rate": 8.696864815584995e-06, | |
| "loss": 2.6052, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 10.31498908996582, | |
| "learning_rate": 8.455183584993009e-06, | |
| "loss": 3.0981, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 11.46462345123291, | |
| "learning_rate": 8.2162226968499e-06, | |
| "loss": 3.1952, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 9.817370414733887, | |
| "learning_rate": 7.980021441736576e-06, | |
| "loss": 2.9148, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.748, | |
| "grad_norm": 12.085224151611328, | |
| "learning_rate": 7.746618656487748e-06, | |
| "loss": 3.1418, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 13.42601490020752, | |
| "learning_rate": 7.516052717806346e-06, | |
| "loss": 3.0495, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.756, | |
| "grad_norm": 10.328361511230469, | |
| "learning_rate": 7.288361535953472e-06, | |
| "loss": 3.2537, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 8.320837020874023, | |
| "learning_rate": 7.06358254851513e-06, | |
| "loss": 3.2002, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.764, | |
| "grad_norm": 12.367525100708008, | |
| "learning_rate": 6.841752714246588e-06, | |
| "loss": 3.415, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 8.72415828704834, | |
| "learning_rate": 6.622908506995581e-06, | |
| "loss": 2.7481, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.772, | |
| "grad_norm": 9.888436317443848, | |
| "learning_rate": 6.407085909705157e-06, | |
| "loss": 3.4815, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 7.541075706481934, | |
| "learning_rate": 6.194320408497245e-06, | |
| "loss": 3.4048, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 11.171248435974121, | |
| "learning_rate": 5.98464698683798e-06, | |
| "loss": 3.5409, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 9.28205394744873, | |
| "learning_rate": 5.778100119785587e-06, | |
| "loss": 3.1082, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.788, | |
| "grad_norm": 8.433388710021973, | |
| "learning_rate": 5.5747137683219404e-06, | |
| "loss": 2.9565, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 14.938470840454102, | |
| "learning_rate": 5.374521373768549e-06, | |
| "loss": 3.2282, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.796, | |
| "grad_norm": 9.903738975524902, | |
| "learning_rate": 5.177555852288119e-06, | |
| "loss": 2.9652, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 13.002461433410645, | |
| "learning_rate": 4.983849589472348e-06, | |
| "loss": 3.221, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.804, | |
| "grad_norm": 12.107378005981445, | |
| "learning_rate": 4.793434435016986e-06, | |
| "loss": 3.1341, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 11.94257640838623, | |
| "learning_rate": 4.606341697485087e-06, | |
| "loss": 3.318, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.812, | |
| "grad_norm": 10.116772651672363, | |
| "learning_rate": 4.422602139159091e-06, | |
| "loss": 3.2286, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 10.068933486938477, | |
| "learning_rate": 4.242245970982883e-06, | |
| "loss": 3.306, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 10.280326843261719, | |
| "learning_rate": 4.065302847594369e-06, | |
| "loss": 3.005, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 10.214073181152344, | |
| "learning_rate": 3.891801862449629e-06, | |
| "loss": 2.9953, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.828, | |
| "grad_norm": 12.787151336669922, | |
| "learning_rate": 3.721771543039254e-06, | |
| "loss": 2.9877, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 7.119079113006592, | |
| "learning_rate": 3.5552398461978277e-06, | |
| "loss": 3.0851, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.836, | |
| "grad_norm": 6.1061177253723145, | |
| "learning_rate": 3.3922341535071483e-06, | |
| "loss": 2.9198, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 9.866963386535645, | |
| "learning_rate": 3.23278126679408e-06, | |
| "loss": 2.9846, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.844, | |
| "grad_norm": 9.084943771362305, | |
| "learning_rate": 3.0769074037237583e-06, | |
| "loss": 2.9903, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 6.5540595054626465, | |
| "learning_rate": 2.9246381934887684e-06, | |
| "loss": 3.2851, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.852, | |
| "grad_norm": 7.740701675415039, | |
| "learning_rate": 2.7759986725951703e-06, | |
| "loss": 2.9797, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 10.074856758117676, | |
| "learning_rate": 2.6310132807458894e-06, | |
| "loss": 3.1325, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 10.44127368927002, | |
| "learning_rate": 2.4897058568223137e-06, | |
| "loss": 3.0159, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 9.894632339477539, | |
| "learning_rate": 2.3520996349645995e-06, | |
| "loss": 2.8015, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.868, | |
| "grad_norm": 9.043245315551758, | |
| "learning_rate": 2.218217240751491e-06, | |
| "loss": 3.4477, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 9.901315689086914, | |
| "learning_rate": 2.088080687480151e-06, | |
| "loss": 3.3157, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.876, | |
| "grad_norm": 8.202696800231934, | |
| "learning_rate": 1.961711372546657e-06, | |
| "loss": 2.9467, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 8.691917419433594, | |
| "learning_rate": 1.8391300739278139e-06, | |
| "loss": 2.9079, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.884, | |
| "grad_norm": 13.363630294799805, | |
| "learning_rate": 1.7203569467647674e-06, | |
| "loss": 3.2583, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 11.732659339904785, | |
| "learning_rate": 1.6054115200490493e-06, | |
| "loss": 3.0431, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.892, | |
| "grad_norm": 7.8193230628967285, | |
| "learning_rate": 1.4943126934115536e-06, | |
| "loss": 3.1155, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 6.232199192047119, | |
| "learning_rate": 1.3870787340150376e-06, | |
| "loss": 3.2006, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 5.650846004486084, | |
| "learning_rate": 1.2837272735505668e-06, | |
| "loss": 2.8882, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 7.191598892211914, | |
| "learning_rate": 1.1842753053384559e-06, | |
| "loss": 3.0833, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.908, | |
| "grad_norm": 8.854833602905273, | |
| "learning_rate": 1.0887391815342124e-06, | |
| "loss": 3.3196, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 13.160386085510254, | |
| "learning_rate": 9.971346104398455e-07, | |
| "loss": 3.564, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.916, | |
| "grad_norm": 8.540671348571777, | |
| "learning_rate": 9.09476653921082e-07, | |
| "loss": 3.1383, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 12.331473350524902, | |
| "learning_rate": 8.257797249308419e-07, | |
| "loss": 3.259, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.924, | |
| "grad_norm": 7.576813697814941, | |
| "learning_rate": 7.460575851394341e-07, | |
| "loss": 2.8659, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 6.937955379486084, | |
| "learning_rate": 6.703233426718136e-07, | |
| "loss": 2.9416, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.932, | |
| "grad_norm": 7.9867777824401855, | |
| "learning_rate": 5.985894499523193e-07, | |
| "loss": 3.0008, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 10.474209785461426, | |
| "learning_rate": 5.308677016572145e-07, | |
| "loss": 3.6042, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 6.954331398010254, | |
| "learning_rate": 4.6716923277536627e-07, | |
| "loss": 2.696, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 6.555063247680664, | |
| "learning_rate": 4.075045167774072e-07, | |
| "loss": 3.2311, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.948, | |
| "grad_norm": 7.122920513153076, | |
| "learning_rate": 3.518833638936514e-07, | |
| "loss": 3.1349, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 10.269899368286133, | |
| "learning_rate": 3.003149195010907e-07, | |
| "loss": 2.9381, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.956, | |
| "grad_norm": 8.958882331848145, | |
| "learning_rate": 2.528076626196585e-07, | |
| "loss": 3.0804, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 11.036646842956543, | |
| "learning_rate": 2.0936940451811437e-07, | |
| "loss": 3.0191, | |
| "step": 2400 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.128334475132928e+20, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |