| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 100.0, | |
| "eval_steps": 100, | |
| "global_step": 3200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 22.61836051940918, | |
| "learning_rate": 5.193376768186383e-06, | |
| "loss": 28.116, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 12.9418363571167, | |
| "learning_rate": 6.666429955495757e-06, | |
| "loss": 26.7925, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 10.848026275634766, | |
| "learning_rate": 7.528110831639518e-06, | |
| "loss": 25.6575, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.256, | |
| "grad_norm": 10.735931396484375, | |
| "learning_rate": 8.085678584707068e-06, | |
| "loss": 23.0539, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.576, | |
| "grad_norm": 10.395103454589844, | |
| "learning_rate": 8.57076623169122e-06, | |
| "loss": 24.4311, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.896, | |
| "grad_norm": 10.887977600097656, | |
| "learning_rate": 8.965446093665792e-06, | |
| "loss": 24.2035, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": 10.915943145751953, | |
| "learning_rate": 9.267156540931188e-06, | |
| "loss": 21.8915, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": 10.198206901550293, | |
| "learning_rate": 9.558731772016444e-06, | |
| "loss": 23.4517, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 2.832, | |
| "grad_norm": 10.81621265411377, | |
| "learning_rate": 9.815086334040616e-06, | |
| "loss": 23.2951, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 3.128, | |
| "grad_norm": 12.31503677368164, | |
| "learning_rate": 9.999997695170515e-06, | |
| "loss": 21.3033, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 3.448, | |
| "grad_norm": 12.238049507141113, | |
| "learning_rate": 9.999721118489014e-06, | |
| "loss": 22.7153, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 3.768, | |
| "grad_norm": 10.42602252960205, | |
| "learning_rate": 9.998983608373495e-06, | |
| "loss": 22.6426, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 4.064, | |
| "grad_norm": 11.45156192779541, | |
| "learning_rate": 9.997785240371542e-06, | |
| "loss": 20.7914, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 4.384, | |
| "grad_norm": 12.555192947387695, | |
| "learning_rate": 9.996126137239184e-06, | |
| "loss": 22.1085, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 4.704, | |
| "grad_norm": 11.370450019836426, | |
| "learning_rate": 9.994006468928306e-06, | |
| "loss": 22.0096, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 12.388717651367188, | |
| "learning_rate": 9.991705162704631e-06, | |
| "loss": 20.3489, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 5.32, | |
| "grad_norm": 11.950061798095703, | |
| "learning_rate": 9.988711057723813e-06, | |
| "loss": 21.4641, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 5.64, | |
| "grad_norm": 12.537701606750488, | |
| "learning_rate": 9.985257147135564e-06, | |
| "loss": 21.4924, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 5.96, | |
| "grad_norm": 11.869683265686035, | |
| "learning_rate": 9.98134378474467e-06, | |
| "loss": 21.3469, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 6.256, | |
| "grad_norm": 13.779632568359375, | |
| "learning_rate": 9.97742925764817e-06, | |
| "loss": 19.3486, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 6.576, | |
| "grad_norm": 13.29927921295166, | |
| "learning_rate": 9.97264408009259e-06, | |
| "loss": 20.8448, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 6.896, | |
| "grad_norm": 13.365044593811035, | |
| "learning_rate": 9.967400742766507e-06, | |
| "loss": 20.8125, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 7.192, | |
| "grad_norm": 13.839238166809082, | |
| "learning_rate": 9.962290455518914e-06, | |
| "loss": 19.0171, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 7.5120000000000005, | |
| "grad_norm": 15.155104637145996, | |
| "learning_rate": 9.956178133102374e-06, | |
| "loss": 20.2994, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 7.832, | |
| "grad_norm": 15.683795928955078, | |
| "learning_rate": 9.949609337621082e-06, | |
| "loss": 20.2594, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 8.128, | |
| "grad_norm": 15.537983894348145, | |
| "learning_rate": 9.94258474195617e-06, | |
| "loss": 18.5829, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 8.448, | |
| "grad_norm": 16.607322692871094, | |
| "learning_rate": 9.935105065679127e-06, | |
| "loss": 19.7508, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 8.768, | |
| "grad_norm": 15.696822166442871, | |
| "learning_rate": 9.927171074978072e-06, | |
| "loss": 19.5839, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 9.064, | |
| "grad_norm": 16.520605087280273, | |
| "learning_rate": 9.91964271527259e-06, | |
| "loss": 18.1771, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 9.384, | |
| "grad_norm": 17.410171508789062, | |
| "learning_rate": 9.910847804623571e-06, | |
| "loss": 19.0556, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 9.704, | |
| "grad_norm": 17.677932739257812, | |
| "learning_rate": 9.901601064367343e-06, | |
| "loss": 19.0936, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 17.897781372070312, | |
| "learning_rate": 9.892893465727831e-06, | |
| "loss": 17.7231, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 10.32, | |
| "grad_norm": 19.219694137573242, | |
| "learning_rate": 9.88279089692669e-06, | |
| "loss": 18.4743, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 10.64, | |
| "grad_norm": 18.7277774810791, | |
| "learning_rate": 9.872239372555743e-06, | |
| "loss": 18.4844, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 10.96, | |
| "grad_norm": 21.37215232849121, | |
| "learning_rate": 9.86123997347095e-06, | |
| "loss": 18.4988, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 11.256, | |
| "grad_norm": 23.783323287963867, | |
| "learning_rate": 9.850958511714194e-06, | |
| "loss": 16.5981, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 11.576, | |
| "grad_norm": 23.457868576049805, | |
| "learning_rate": 9.839111292652133e-06, | |
| "loss": 17.8158, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 11.896, | |
| "grad_norm": 23.80459213256836, | |
| "learning_rate": 9.826819592384226e-06, | |
| "loss": 17.8393, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 12.192, | |
| "grad_norm": 24.82889747619629, | |
| "learning_rate": 9.814084670023088e-06, | |
| "loss": 16.1185, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 12.512, | |
| "grad_norm": 28.92336082458496, | |
| "learning_rate": 9.800907830083227e-06, | |
| "loss": 17.25, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 12.832, | |
| "grad_norm": 27.11651611328125, | |
| "learning_rate": 9.787290422347427e-06, | |
| "loss": 17.2422, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 13.128, | |
| "grad_norm": 30.726104736328125, | |
| "learning_rate": 9.774659221884884e-06, | |
| "loss": 15.6421, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 13.448, | |
| "grad_norm": 27.514354705810547, | |
| "learning_rate": 9.760208615506392e-06, | |
| "loss": 16.4979, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 13.768, | |
| "grad_norm": 31.63400650024414, | |
| "learning_rate": 9.745321610396716e-06, | |
| "loss": 16.5669, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 14.064, | |
| "grad_norm": 31.240859985351562, | |
| "learning_rate": 9.731551444345466e-06, | |
| "loss": 15.132, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 14.384, | |
| "grad_norm": 34.0392951965332, | |
| "learning_rate": 9.715839519747009e-06, | |
| "loss": 15.7479, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 14.704, | |
| "grad_norm": 31.50341033935547, | |
| "learning_rate": 9.69969574141008e-06, | |
| "loss": 15.8802, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 30.895503997802734, | |
| "learning_rate": 9.684798471835533e-06, | |
| "loss": 14.6161, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 15.32, | |
| "grad_norm": 34.22053146362305, | |
| "learning_rate": 9.667838763776003e-06, | |
| "loss": 15.0823, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 15.64, | |
| "grad_norm": 40.52189636230469, | |
| "learning_rate": 9.650452118983454e-06, | |
| "loss": 15.0497, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 15.96, | |
| "grad_norm": 37.59934616088867, | |
| "learning_rate": 9.632640318476236e-06, | |
| "loss": 15.1599, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 16.256, | |
| "grad_norm": 44.10334396362305, | |
| "learning_rate": 9.614405186823954e-06, | |
| "loss": 13.3485, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 16.576, | |
| "grad_norm": 39.68657302856445, | |
| "learning_rate": 9.595748591960559e-06, | |
| "loss": 14.352, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 16.896, | |
| "grad_norm": 35.99330520629883, | |
| "learning_rate": 9.576672444993012e-06, | |
| "loss": 14.4162, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 17.192, | |
| "grad_norm": 46.91948699951172, | |
| "learning_rate": 9.55914680983895e-06, | |
| "loss": 12.9685, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 17.512, | |
| "grad_norm": 46.87045669555664, | |
| "learning_rate": 9.539278932734926e-06, | |
| "loss": 13.6413, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 17.832, | |
| "grad_norm": 48.02497100830078, | |
| "learning_rate": 9.51899728805278e-06, | |
| "loss": 13.5406, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 18.128, | |
| "grad_norm": 44.37493896484375, | |
| "learning_rate": 9.500391752799266e-06, | |
| "loss": 12.2899, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 18.448, | |
| "grad_norm": 47.000877380371094, | |
| "learning_rate": 9.479329708285107e-06, | |
| "loss": 12.7494, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 18.768, | |
| "grad_norm": 47.82246780395508, | |
| "learning_rate": 9.4578600371516e-06, | |
| "loss": 13.0171, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 19.064, | |
| "grad_norm": 44.77831268310547, | |
| "learning_rate": 9.43819062920712e-06, | |
| "loss": 11.778, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 19.384, | |
| "grad_norm": 47.652130126953125, | |
| "learning_rate": 9.415952560807661e-06, | |
| "loss": 12.1909, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 19.704, | |
| "grad_norm": 45.41845703125, | |
| "learning_rate": 9.393313357890357e-06, | |
| "loss": 12.3073, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 55.55602264404297, | |
| "learning_rate": 9.370275339524604e-06, | |
| "loss": 11.2453, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 20.32, | |
| "grad_norm": 54.18324279785156, | |
| "learning_rate": 9.34684086563286e-06, | |
| "loss": 11.3862, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 20.64, | |
| "grad_norm": 52.06657028198242, | |
| "learning_rate": 9.3230123367489e-06, | |
| "loss": 11.2495, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 20.96, | |
| "grad_norm": 52.56513977050781, | |
| "learning_rate": 9.298792193771915e-06, | |
| "loss": 11.4311, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 21.256, | |
| "grad_norm": 55.839622497558594, | |
| "learning_rate": 9.276661284776841e-06, | |
| "loss": 9.9001, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 21.576, | |
| "grad_norm": 54.41539764404297, | |
| "learning_rate": 9.251703943168894e-06, | |
| "loss": 10.7236, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 21.896, | |
| "grad_norm": 54.44001007080078, | |
| "learning_rate": 9.226362292014552e-06, | |
| "loss": 10.793, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 22.192, | |
| "grad_norm": 56.16716003417969, | |
| "learning_rate": 9.203228366013518e-06, | |
| "loss": 9.4557, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 22.512, | |
| "grad_norm": 62.475990295410156, | |
| "learning_rate": 9.177163710741043e-06, | |
| "loss": 10.1059, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 22.832, | |
| "grad_norm": 60.76034164428711, | |
| "learning_rate": 9.150722381525448e-06, | |
| "loss": 10.0807, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 23.128, | |
| "grad_norm": 57.365806579589844, | |
| "learning_rate": 9.126605366828865e-06, | |
| "loss": 9.0266, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 23.448, | |
| "grad_norm": 65.74173736572266, | |
| "learning_rate": 9.099455850827047e-06, | |
| "loss": 9.4513, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 23.768, | |
| "grad_norm": 63.232688903808594, | |
| "learning_rate": 9.071937620964472e-06, | |
| "loss": 9.3833, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 24.064, | |
| "grad_norm": 55.957969665527344, | |
| "learning_rate": 9.044053496098546e-06, | |
| "loss": 8.5141, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 24.384, | |
| "grad_norm": 59.8948860168457, | |
| "learning_rate": 9.015806332567492e-06, | |
| "loss": 9.0655, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 24.704, | |
| "grad_norm": 62.30965042114258, | |
| "learning_rate": 8.987199023897762e-06, | |
| "loss": 8.6284, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 65.89315795898438, | |
| "learning_rate": 8.961146943234696e-06, | |
| "loss": 8.191, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 25.32, | |
| "grad_norm": 57.95423126220703, | |
| "learning_rate": 8.931863462353107e-06, | |
| "loss": 8.1047, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 25.64, | |
| "grad_norm": 70.34273529052734, | |
| "learning_rate": 8.902228435104725e-06, | |
| "loss": 8.6509, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 25.96, | |
| "grad_norm": 53.95098114013672, | |
| "learning_rate": 8.872244897183212e-06, | |
| "loss": 8.2501, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 26.256, | |
| "grad_norm": 62.12706756591797, | |
| "learning_rate": 8.844964274202623e-06, | |
| "loss": 7.0761, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 26.576, | |
| "grad_norm": 61.21213150024414, | |
| "learning_rate": 8.814327056944412e-06, | |
| "loss": 7.9348, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 26.896, | |
| "grad_norm": 72.10478210449219, | |
| "learning_rate": 8.783350333278674e-06, | |
| "loss": 7.8212, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 27.192, | |
| "grad_norm": 69.69745635986328, | |
| "learning_rate": 8.755183625881033e-06, | |
| "loss": 6.836, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 27.512, | |
| "grad_norm": 65.09381103515625, | |
| "learning_rate": 8.723570610500632e-06, | |
| "loss": 7.0539, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 27.832, | |
| "grad_norm": 56.70985794067383, | |
| "learning_rate": 8.691627385440082e-06, | |
| "loss": 7.3318, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 28.128, | |
| "grad_norm": 54.06085205078125, | |
| "learning_rate": 8.659357222835593e-06, | |
| "loss": 6.5638, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 28.448, | |
| "grad_norm": 63.56743240356445, | |
| "learning_rate": 8.626763428313554e-06, | |
| "loss": 6.803, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 28.768, | |
| "grad_norm": 56.118526458740234, | |
| "learning_rate": 8.593849340651913e-06, | |
| "loss": 6.8041, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 29.064, | |
| "grad_norm": 56.52422332763672, | |
| "learning_rate": 8.56395559706677e-06, | |
| "loss": 6.3728, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 29.384, | |
| "grad_norm": 61.43674850463867, | |
| "learning_rate": 8.530442268002263e-06, | |
| "loss": 6.4516, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 29.704, | |
| "grad_norm": 54.28837203979492, | |
| "learning_rate": 8.496618512552564e-06, | |
| "loss": 6.2512, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 61.01597213745117, | |
| "learning_rate": 8.465914581069757e-06, | |
| "loss": 6.0489, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 30.32, | |
| "grad_norm": 52.78199768066406, | |
| "learning_rate": 8.431510586925946e-06, | |
| "loss": 6.3215, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 30.64, | |
| "grad_norm": 66.41915130615234, | |
| "learning_rate": 8.39680630056266e-06, | |
| "loss": 6.177, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 30.96, | |
| "grad_norm": 73.75415802001953, | |
| "learning_rate": 8.361805276948188e-06, | |
| "loss": 6.2312, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 31.256, | |
| "grad_norm": 61.83228302001953, | |
| "learning_rate": 8.3300536080273e-06, | |
| "loss": 5.5059, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 31.576, | |
| "grad_norm": 56.67393493652344, | |
| "learning_rate": 8.294498686142642e-06, | |
| "loss": 5.6519, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 31.896, | |
| "grad_norm": 62.11791229248047, | |
| "learning_rate": 8.258657506989624e-06, | |
| "loss": 5.9147, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 32.192, | |
| "grad_norm": 58.08525085449219, | |
| "learning_rate": 8.222533741995281e-06, | |
| "loss": 5.0185, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 32.512, | |
| "grad_norm": 59.7594108581543, | |
| "learning_rate": 8.186131091533624e-06, | |
| "loss": 5.5372, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 32.832, | |
| "grad_norm": 57.91405487060547, | |
| "learning_rate": 8.149453284546562e-06, | |
| "loss": 5.7239, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 33.128, | |
| "grad_norm": 56.105133056640625, | |
| "learning_rate": 8.116211104118957e-06, | |
| "loss": 5.0643, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 33.448, | |
| "grad_norm": 57.08987045288086, | |
| "learning_rate": 8.079020873606434e-06, | |
| "loss": 5.1807, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 33.768, | |
| "grad_norm": 53.906761169433594, | |
| "learning_rate": 8.04156645851083e-06, | |
| "loss": 5.3584, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 34.064, | |
| "grad_norm": 50.738651275634766, | |
| "learning_rate": 8.00763477756982e-06, | |
| "loss": 4.6149, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 34.384, | |
| "grad_norm": 62.51714324951172, | |
| "learning_rate": 7.969689003871167e-06, | |
| "loss": 5.1416, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 34.704, | |
| "grad_norm": 64.98294830322266, | |
| "learning_rate": 7.931490245108871e-06, | |
| "loss": 5.001, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 48.71992111206055, | |
| "learning_rate": 7.89689829349943e-06, | |
| "loss": 4.5248, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 35.32, | |
| "grad_norm": 49.821617126464844, | |
| "learning_rate": 7.858229664328653e-06, | |
| "loss": 4.6947, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 35.64, | |
| "grad_norm": 60.61543655395508, | |
| "learning_rate": 7.819319467543181e-06, | |
| "loss": 4.7697, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 35.96, | |
| "grad_norm": 55.65413284301758, | |
| "learning_rate": 7.780171688947942e-06, | |
| "loss": 4.7358, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 36.256, | |
| "grad_norm": 58.530906677246094, | |
| "learning_rate": 7.740790338684804e-06, | |
| "loss": 4.1803, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 36.576, | |
| "grad_norm": 59.444671630859375, | |
| "learning_rate": 7.701179450821773e-06, | |
| "loss": 4.4415, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 36.896, | |
| "grad_norm": 49.884071350097656, | |
| "learning_rate": 7.661343082939769e-06, | |
| "loss": 4.7063, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 37.192, | |
| "grad_norm": 52.532249450683594, | |
| "learning_rate": 7.625300938639757e-06, | |
| "loss": 4.1235, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 37.512, | |
| "grad_norm": 51.9701042175293, | |
| "learning_rate": 7.58504741974322e-06, | |
| "loss": 4.4617, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 37.832, | |
| "grad_norm": 49.44038391113281, | |
| "learning_rate": 7.544580316928487e-06, | |
| "loss": 4.4452, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 38.128, | |
| "grad_norm": 50.28534698486328, | |
| "learning_rate": 7.507980735780601e-06, | |
| "loss": 3.9453, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 38.448, | |
| "grad_norm": 46.916019439697266, | |
| "learning_rate": 7.467119261530757e-06, | |
| "loss": 4.2222, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 38.768, | |
| "grad_norm": 49.836448669433594, | |
| "learning_rate": 7.426056283451572e-06, | |
| "loss": 4.3332, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 39.064, | |
| "grad_norm": 44.598793029785156, | |
| "learning_rate": 7.388930793516118e-06, | |
| "loss": 3.9014, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 39.384, | |
| "grad_norm": 42.805973052978516, | |
| "learning_rate": 7.347496563305526e-06, | |
| "loss": 4.1539, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 39.704, | |
| "grad_norm": 48.368324279785156, | |
| "learning_rate": 7.3058730829363485e-06, | |
| "loss": 3.9781, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 50.3718147277832, | |
| "learning_rate": 7.264064616151436e-06, | |
| "loss": 3.7175, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 40.32, | |
| "grad_norm": 52.81575012207031, | |
| "learning_rate": 7.222075445642904e-06, | |
| "loss": 3.8827, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 40.64, | |
| "grad_norm": 53.90591812133789, | |
| "learning_rate": 7.17990987261344e-06, | |
| "loss": 3.9467, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 40.96, | |
| "grad_norm": 46.753875732421875, | |
| "learning_rate": 7.1375722163356945e-06, | |
| "loss": 4.0143, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 41.256, | |
| "grad_norm": 43.555267333984375, | |
| "learning_rate": 7.099324778599362e-06, | |
| "loss": 3.396, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 41.576, | |
| "grad_norm": 43.159061431884766, | |
| "learning_rate": 7.056672126521037e-06, | |
| "loss": 3.888, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 41.896, | |
| "grad_norm": 48.97645950317383, | |
| "learning_rate": 7.0138600151762305e-06, | |
| "loss": 3.8203, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 42.192, | |
| "grad_norm": 50.10870361328125, | |
| "learning_rate": 6.975196401579741e-06, | |
| "loss": 3.5406, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 42.512, | |
| "grad_norm": 44.10620880126953, | |
| "learning_rate": 6.9320934128234985e-06, | |
| "loss": 3.4479, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 42.832, | |
| "grad_norm": 49.795814514160156, | |
| "learning_rate": 6.8888437261459315e-06, | |
| "loss": 3.7759, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 43.128, | |
| "grad_norm": 44.46809005737305, | |
| "learning_rate": 6.849797242773753e-06, | |
| "loss": 3.2658, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 43.448, | |
| "grad_norm": 45.29841232299805, | |
| "learning_rate": 6.80628104764508e-06, | |
| "loss": 3.4485, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 43.768, | |
| "grad_norm": 43.17744445800781, | |
| "learning_rate": 6.762631042310571e-06, | |
| "loss": 3.6471, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 44.064, | |
| "grad_norm": 43.51730728149414, | |
| "learning_rate": 6.718851698102184e-06, | |
| "loss": 3.2223, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 44.384, | |
| "grad_norm": 44.667606353759766, | |
| "learning_rate": 6.674947499600837e-06, | |
| "loss": 3.4478, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 44.704, | |
| "grad_norm": 43.00602340698242, | |
| "learning_rate": 6.630922944177019e-06, | |
| "loss": 3.2787, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "grad_norm": 41.20823669433594, | |
| "learning_rate": 6.591201666146107e-06, | |
| "loss": 3.2353, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 45.32, | |
| "grad_norm": 44.2363395690918, | |
| "learning_rate": 6.546960866608958e-06, | |
| "loss": 3.3232, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 45.64, | |
| "grad_norm": 34.35457992553711, | |
| "learning_rate": 6.5026128205881235e-06, | |
| "loss": 3.2552, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 45.96, | |
| "grad_norm": 42.78106689453125, | |
| "learning_rate": 6.458162070920059e-06, | |
| "loss": 3.6029, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 46.256, | |
| "grad_norm": 41.89685821533203, | |
| "learning_rate": 6.41807234774012e-06, | |
| "loss": 3.1435, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 46.576, | |
| "grad_norm": 35.95709991455078, | |
| "learning_rate": 6.373439013960264e-06, | |
| "loss": 3.0505, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 46.896, | |
| "grad_norm": 40.73233413696289, | |
| "learning_rate": 6.328716208581277e-06, | |
| "loss": 3.2412, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 47.192, | |
| "grad_norm": 38.4586296081543, | |
| "learning_rate": 6.288392971723836e-06, | |
| "loss": 3.0233, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 47.512, | |
| "grad_norm": 39.029319763183594, | |
| "learning_rate": 6.243512798779037e-06, | |
| "loss": 3.2573, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 47.832, | |
| "grad_norm": 43.8737678527832, | |
| "learning_rate": 6.198556463356931e-06, | |
| "loss": 3.1004, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 48.128, | |
| "grad_norm": 41.60302734375, | |
| "learning_rate": 6.1535285706047075e-06, | |
| "loss": 2.905, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 48.448, | |
| "grad_norm": 41.740875244140625, | |
| "learning_rate": 6.108433732999604e-06, | |
| "loss": 3.1313, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 48.768, | |
| "grad_norm": 38.86418151855469, | |
| "learning_rate": 6.063276569876421e-06, | |
| "loss": 3.115, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 49.064, | |
| "grad_norm": 42.09383010864258, | |
| "learning_rate": 6.022585657780215e-06, | |
| "loss": 2.8219, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 49.384, | |
| "grad_norm": 31.668691635131836, | |
| "learning_rate": 5.9773228249238725e-06, | |
| "loss": 2.9306, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 49.704, | |
| "grad_norm": 40.560951232910156, | |
| "learning_rate": 5.932011097026472e-06, | |
| "loss": 3.0078, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 39.417659759521484, | |
| "learning_rate": 5.891192572805754e-06, | |
| "loss": 2.7878, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 50.32, | |
| "grad_norm": 38.46611022949219, | |
| "learning_rate": 5.84580073556307e-06, | |
| "loss": 3.0789, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 50.64, | |
| "grad_norm": 43.09978485107422, | |
| "learning_rate": 5.80037347587612e-06, | |
| "loss": 2.9716, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 50.96, | |
| "grad_norm": 37.992034912109375, | |
| "learning_rate": 5.754915447131693e-06, | |
| "loss": 3.1029, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 51.256, | |
| "grad_norm": 34.79606628417969, | |
| "learning_rate": 5.713980762289322e-06, | |
| "loss": 2.7127, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 51.576, | |
| "grad_norm": 34.46636962890625, | |
| "learning_rate": 5.668477103320299e-06, | |
| "loss": 2.9521, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 51.896, | |
| "grad_norm": 39.093299865722656, | |
| "learning_rate": 5.6229561862306635e-06, | |
| "loss": 2.8132, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 52.192, | |
| "grad_norm": 36.822364807128906, | |
| "learning_rate": 5.577422674001093e-06, | |
| "loss": 2.6165, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 52.512, | |
| "grad_norm": 37.18339157104492, | |
| "learning_rate": 5.531881230902461e-06, | |
| "loss": 2.8924, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 52.832, | |
| "grad_norm": 34.324920654296875, | |
| "learning_rate": 5.486336522018052e-06, | |
| "loss": 2.7838, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 53.128, | |
| "grad_norm": 34.57423782348633, | |
| "learning_rate": 5.445347347743854e-06, | |
| "loss": 2.5569, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 53.448, | |
| "grad_norm": 34.08185958862305, | |
| "learning_rate": 5.399809286983922e-06, | |
| "loss": 2.6638, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 53.768, | |
| "grad_norm": 35.969120025634766, | |
| "learning_rate": 5.354281489359898e-06, | |
| "loss": 2.8466, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 54.064, | |
| "grad_norm": 36.20486831665039, | |
| "learning_rate": 5.313319101044968e-06, | |
| "loss": 2.6815, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 54.384, | |
| "grad_norm": 34.20916748046875, | |
| "learning_rate": 5.267823650578826e-06, | |
| "loss": 2.8057, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 54.704, | |
| "grad_norm": 35.21307373046875, | |
| "learning_rate": 5.222351983329293e-06, | |
| "loss": 2.5881, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "grad_norm": 37.0841064453125, | |
| "learning_rate": 5.181451667296815e-06, | |
| "loss": 2.5513, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 55.32, | |
| "grad_norm": 32.83190155029297, | |
| "learning_rate": 5.136038018373682e-06, | |
| "loss": 2.7037, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 55.64, | |
| "grad_norm": 36.691349029541016, | |
| "learning_rate": 5.0906616522602225e-06, | |
| "loss": 2.4978, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 55.96, | |
| "grad_norm": 37.95656204223633, | |
| "learning_rate": 5.045327217129888e-06, | |
| "loss": 2.7158, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 56.256, | |
| "grad_norm": 37.772701263427734, | |
| "learning_rate": 5.000039356860898e-06, | |
| "loss": 2.458, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 56.576, | |
| "grad_norm": 33.042057037353516, | |
| "learning_rate": 4.954802710560514e-06, | |
| "loss": 2.6958, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 56.896, | |
| "grad_norm": 34.64514923095703, | |
| "learning_rate": 4.909621912089858e-06, | |
| "loss": 2.6069, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 57.192, | |
| "grad_norm": 30.218544006347656, | |
| "learning_rate": 4.869010768644479e-06, | |
| "loss": 2.4799, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 57.512, | |
| "grad_norm": 33.19715881347656, | |
| "learning_rate": 4.82394882646648e-06, | |
| "loss": 2.638, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 57.832, | |
| "grad_norm": 34.21620178222656, | |
| "learning_rate": 4.778956136266724e-06, | |
| "loss": 2.5429, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 58.128, | |
| "grad_norm": 30.74477195739746, | |
| "learning_rate": 4.738525734914598e-06, | |
| "loss": 2.2018, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 58.448, | |
| "grad_norm": 29.80803680419922, | |
| "learning_rate": 4.693677314670168e-06, | |
| "loss": 2.5071, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 58.768, | |
| "grad_norm": 28.387727737426758, | |
| "learning_rate": 4.648911490896609e-06, | |
| "loss": 2.5778, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 59.064, | |
| "grad_norm": 38.00436782836914, | |
| "learning_rate": 4.608696659690191e-06, | |
| "loss": 2.3583, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 59.384, | |
| "grad_norm": 28.832168579101562, | |
| "learning_rate": 4.5641003952558335e-06, | |
| "loss": 2.4408, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 59.704, | |
| "grad_norm": 30.9281005859375, | |
| "learning_rate": 4.519600000633706e-06, | |
| "loss": 2.572, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "grad_norm": 29.808483123779297, | |
| "learning_rate": 4.4752000342662435e-06, | |
| "loss": 2.2828, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 60.32, | |
| "grad_norm": 30.235679626464844, | |
| "learning_rate": 4.430905044308421e-06, | |
| "loss": 2.4467, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 60.64, | |
| "grad_norm": 34.53474044799805, | |
| "learning_rate": 4.386719568161842e-06, | |
| "loss": 2.6138, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 60.96, | |
| "grad_norm": 28.18803596496582, | |
| "learning_rate": 4.342648132009955e-06, | |
| "loss": 2.3741, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 61.256, | |
| "grad_norm": 36.68701934814453, | |
| "learning_rate": 4.3030850751531636e-06, | |
| "loss": 2.265, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 61.576, | |
| "grad_norm": 28.751218795776367, | |
| "learning_rate": 4.259242742415551e-06, | |
| "loss": 2.323, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 61.896, | |
| "grad_norm": 36.796180725097656, | |
| "learning_rate": 4.215527507888797e-06, | |
| "loss": 2.4313, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 62.192, | |
| "grad_norm": 32.65000534057617, | |
| "learning_rate": 4.1762961671398e-06, | |
| "loss": 2.2107, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 62.512, | |
| "grad_norm": 29.814340591430664, | |
| "learning_rate": 4.1328347450114085e-06, | |
| "loss": 2.3156, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 62.832, | |
| "grad_norm": 30.403644561767578, | |
| "learning_rate": 4.08951336982354e-06, | |
| "loss": 2.4868, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 63.128, | |
| "grad_norm": 30.64937400817871, | |
| "learning_rate": 4.050647540325295e-06, | |
| "loss": 2.1469, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 63.448, | |
| "grad_norm": 33.04718017578125, | |
| "learning_rate": 4.0076044678776885e-06, | |
| "loss": 2.2953, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 63.768, | |
| "grad_norm": 26.519309997558594, | |
| "learning_rate": 3.964714270459319e-06, | |
| "loss": 2.2162, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 64.064, | |
| "grad_norm": 30.9384822845459, | |
| "learning_rate": 3.921981341570459e-06, | |
| "loss": 2.1822, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 64.384, | |
| "grad_norm": 27.196205139160156, | |
| "learning_rate": 3.87941005860143e-06, | |
| "loss": 2.2589, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 64.704, | |
| "grad_norm": 29.418684005737305, | |
| "learning_rate": 3.837004782384188e-06, | |
| "loss": 2.2247, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "grad_norm": 31.906282424926758, | |
| "learning_rate": 3.798985560092915e-06, | |
| "loss": 2.122, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 65.32, | |
| "grad_norm": 30.91475486755371, | |
| "learning_rate": 3.7569076495338595e-06, | |
| "loss": 2.2306, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 65.64, | |
| "grad_norm": 29.103853225708008, | |
| "learning_rate": 3.7150082943834076e-06, | |
| "loss": 2.3974, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 65.96, | |
| "grad_norm": 30.623212814331055, | |
| "learning_rate": 3.673291786643919e-06, | |
| "loss": 2.1196, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 66.256, | |
| "grad_norm": 36.47549819946289, | |
| "learning_rate": 3.635906796468296e-06, | |
| "loss": 2.0359, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 66.576, | |
| "grad_norm": 32.82271957397461, | |
| "learning_rate": 3.5945494558412943e-06, | |
| "loss": 2.1794, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 66.896, | |
| "grad_norm": 28.202482223510742, | |
| "learning_rate": 3.5533873019468813e-06, | |
| "loss": 2.1184, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 67.192, | |
| "grad_norm": 25.35958480834961, | |
| "learning_rate": 3.5165117334443868e-06, | |
| "loss": 2.019, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 67.512, | |
| "grad_norm": 32.23173141479492, | |
| "learning_rate": 3.4757320338886956e-06, | |
| "loss": 2.1757, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 67.832, | |
| "grad_norm": 29.19985008239746, | |
| "learning_rate": 3.4351596922470677e-06, | |
| "loss": 2.165, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 68.128, | |
| "grad_norm": 33.79531478881836, | |
| "learning_rate": 3.394798864587904e-06, | |
| "loss": 1.9395, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 68.448, | |
| "grad_norm": 25.360973358154297, | |
| "learning_rate": 3.3546536853129603e-06, | |
| "loss": 2.1175, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 68.768, | |
| "grad_norm": 31.0283145904541, | |
| "learning_rate": 3.314728266733837e-06, | |
| "loss": 2.1479, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 69.064, | |
| "grad_norm": 28.047056198120117, | |
| "learning_rate": 3.2789866660993095e-06, | |
| "loss": 1.9538, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 69.384, | |
| "grad_norm": 29.004770278930664, | |
| "learning_rate": 3.2394900412984527e-06, | |
| "loss": 2.1279, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 69.704, | |
| "grad_norm": 29.955669403076172, | |
| "learning_rate": 3.20022497409649e-06, | |
| "loss": 2.1065, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "grad_norm": 25.94803810119629, | |
| "learning_rate": 3.1650877201708363e-06, | |
| "loss": 1.975, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 70.32, | |
| "grad_norm": 27.678482055664062, | |
| "learning_rate": 3.126273673511262e-06, | |
| "loss": 2.0469, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 70.64, | |
| "grad_norm": 26.973514556884766, | |
| "learning_rate": 3.0877027818824744e-06, | |
| "loss": 1.9968, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 70.96, | |
| "grad_norm": 25.811176300048828, | |
| "learning_rate": 3.0493789963323385e-06, | |
| "loss": 2.0934, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 71.256, | |
| "grad_norm": 24.189441680908203, | |
| "learning_rate": 3.015102110184209e-06, | |
| "loss": 1.8912, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 71.576, | |
| "grad_norm": 28.846141815185547, | |
| "learning_rate": 2.9772586203393617e-06, | |
| "loss": 1.9996, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 71.896, | |
| "grad_norm": 29.31644630432129, | |
| "learning_rate": 2.939673550031339e-06, | |
| "loss": 2.1061, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 72.192, | |
| "grad_norm": 24.74772071838379, | |
| "learning_rate": 2.902350749324414e-06, | |
| "loss": 1.8613, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 72.512, | |
| "grad_norm": 26.519685745239258, | |
| "learning_rate": 2.8652940414170094e-06, | |
| "loss": 1.9042, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 72.832, | |
| "grad_norm": 27.277027130126953, | |
| "learning_rate": 2.828507222250073e-06, | |
| "loss": 2.1886, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 73.128, | |
| "grad_norm": 25.948633193969727, | |
| "learning_rate": 2.795632954945905e-06, | |
| "loss": 1.8885, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 73.448, | |
| "grad_norm": 27.202346801757812, | |
| "learning_rate": 2.7593692828767417e-06, | |
| "loss": 2.0617, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 73.768, | |
| "grad_norm": 25.095088958740234, | |
| "learning_rate": 2.723386350056646e-06, | |
| "loss": 2.0202, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 74.064, | |
| "grad_norm": 27.66188621520996, | |
| "learning_rate": 2.6912447896126063e-06, | |
| "loss": 1.9708, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 74.384, | |
| "grad_norm": 27.14434814453125, | |
| "learning_rate": 2.655805392079069e-06, | |
| "loss": 1.9453, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 74.704, | |
| "grad_norm": 27.399051666259766, | |
| "learning_rate": 2.6206573424653525e-06, | |
| "loss": 1.85, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "grad_norm": 27.855464935302734, | |
| "learning_rate": 2.5892761766588475e-06, | |
| "loss": 1.9639, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 75.32, | |
| "grad_norm": 22.55164909362793, | |
| "learning_rate": 2.554691582301635e-06, | |
| "loss": 1.8897, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 75.64, | |
| "grad_norm": 28.89771842956543, | |
| "learning_rate": 2.5204086935584972e-06, | |
| "loss": 2.0627, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 75.96, | |
| "grad_norm": 27.81890106201172, | |
| "learning_rate": 2.486431022231434e-06, | |
| "loss": 1.9662, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 76.256, | |
| "grad_norm": 23.57813835144043, | |
| "learning_rate": 2.4527620488572007e-06, | |
| "loss": 1.8852, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 76.576, | |
| "grad_norm": 28.515785217285156, | |
| "learning_rate": 2.4194052223507637e-06, | |
| "loss": 1.9537, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 76.896, | |
| "grad_norm": 21.694969177246094, | |
| "learning_rate": 2.3863639596520146e-06, | |
| "loss": 1.9615, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 77.192, | |
| "grad_norm": 22.404096603393555, | |
| "learning_rate": 2.3568994283363583e-06, | |
| "loss": 1.7868, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 77.512, | |
| "grad_norm": 22.37274932861328, | |
| "learning_rate": 2.324467034489848e-06, | |
| "loss": 1.92, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 77.832, | |
| "grad_norm": 22.426904678344727, | |
| "learning_rate": 2.29235992953927e-06, | |
| "loss": 1.9547, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 78.128, | |
| "grad_norm": 27.522668838500977, | |
| "learning_rate": 2.2637443760789118e-06, | |
| "loss": 1.8528, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 78.448, | |
| "grad_norm": 28.163259506225586, | |
| "learning_rate": 2.232264353203013e-06, | |
| "loss": 1.9261, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 78.768, | |
| "grad_norm": 25.449567794799805, | |
| "learning_rate": 2.2011190640969375e-06, | |
| "loss": 1.9074, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 79.064, | |
| "grad_norm": 23.236602783203125, | |
| "learning_rate": 2.173377138853083e-06, | |
| "loss": 1.821, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 79.384, | |
| "grad_norm": 25.74871826171875, | |
| "learning_rate": 2.142876604855393e-06, | |
| "loss": 1.9702, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 79.704, | |
| "grad_norm": 27.608022689819336, | |
| "learning_rate": 2.1127199611509476e-06, | |
| "loss": 1.8918, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "grad_norm": 26.690242767333984, | |
| "learning_rate": 2.0829102968657067e-06, | |
| "loss": 1.7362, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 80.32, | |
| "grad_norm": 25.989137649536133, | |
| "learning_rate": 2.0534506655824563e-06, | |
| "loss": 1.8662, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 80.64, | |
| "grad_norm": 26.44927215576172, | |
| "learning_rate": 2.024344085027995e-06, | |
| "loss": 1.8962, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 80.96, | |
| "grad_norm": 26.649433135986328, | |
| "learning_rate": 1.9955935367640172e-06, | |
| "loss": 1.9202, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 81.256, | |
| "grad_norm": 25.331037521362305, | |
| "learning_rate": 1.9700248858111003e-06, | |
| "loss": 1.781, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 81.576, | |
| "grad_norm": 27.306407928466797, | |
| "learning_rate": 1.941958882239165e-06, | |
| "loss": 1.9203, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 81.896, | |
| "grad_norm": 22.897323608398438, | |
| "learning_rate": 1.914257350168377e-06, | |
| "loss": 1.8839, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 82.192, | |
| "grad_norm": 23.076894760131836, | |
| "learning_rate": 1.8896399405188986e-06, | |
| "loss": 1.6662, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 82.512, | |
| "grad_norm": 23.84672737121582, | |
| "learning_rate": 1.8626386908906847e-06, | |
| "loss": 1.8701, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 82.832, | |
| "grad_norm": 25.28089714050293, | |
| "learning_rate": 1.8360100380067636e-06, | |
| "loss": 1.9854, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 83.128, | |
| "grad_norm": 24.182842254638672, | |
| "learning_rate": 1.8123650758918262e-06, | |
| "loss": 1.6976, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 83.448, | |
| "grad_norm": 26.943639755249023, | |
| "learning_rate": 1.7864518399551195e-06, | |
| "loss": 2.0291, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 83.768, | |
| "grad_norm": 22.0482234954834, | |
| "learning_rate": 1.7609190050389946e-06, | |
| "loss": 1.8355, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 84.064, | |
| "grad_norm": 28.498458862304688, | |
| "learning_rate": 1.7357691866249377e-06, | |
| "loss": 1.7042, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 84.384, | |
| "grad_norm": 23.913034439086914, | |
| "learning_rate": 1.7110049609597555e-06, | |
| "loss": 1.8195, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 84.704, | |
| "grad_norm": 24.15852928161621, | |
| "learning_rate": 1.6866288647916768e-06, | |
| "loss": 1.8004, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 85.0, | |
| "grad_norm": 26.641592025756836, | |
| "learning_rate": 1.665024293548139e-06, | |
| "loss": 1.7754, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 85.32, | |
| "grad_norm": 25.73943328857422, | |
| "learning_rate": 1.641392489564914e-06, | |
| "loss": 1.863, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 85.64, | |
| "grad_norm": 21.554733276367188, | |
| "learning_rate": 1.6181559459019353e-06, | |
| "loss": 1.8378, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 85.96, | |
| "grad_norm": 23.64627456665039, | |
| "learning_rate": 1.5953170428177997e-06, | |
| "loss": 1.8041, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 86.256, | |
| "grad_norm": 27.766212463378906, | |
| "learning_rate": 1.5751039472006447e-06, | |
| "loss": 1.7524, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 86.576, | |
| "grad_norm": 19.842952728271484, | |
| "learning_rate": 1.5530269727513447e-06, | |
| "loss": 1.8589, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 86.896, | |
| "grad_norm": 23.708539962768555, | |
| "learning_rate": 1.531354310432403e-06, | |
| "loss": 1.7687, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 87.192, | |
| "grad_norm": 27.59642791748047, | |
| "learning_rate": 1.5121964369565362e-06, | |
| "loss": 1.7757, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 87.512, | |
| "grad_norm": 23.417882919311523, | |
| "learning_rate": 1.4912980495391515e-06, | |
| "loss": 1.8026, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 87.832, | |
| "grad_norm": 23.49812889099121, | |
| "learning_rate": 1.4708102975161232e-06, | |
| "loss": 1.7793, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 88.128, | |
| "grad_norm": 20.413761138916016, | |
| "learning_rate": 1.450735279570825e-06, | |
| "loss": 1.644, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 88.448, | |
| "grad_norm": 20.448760986328125, | |
| "learning_rate": 1.431075052107801e-06, | |
| "loss": 1.7853, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 88.768, | |
| "grad_norm": 24.541154861450195, | |
| "learning_rate": 1.411831629042118e-06, | |
| "loss": 1.85, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 89.064, | |
| "grad_norm": 25.394201278686523, | |
| "learning_rate": 1.3948705461409444e-06, | |
| "loss": 1.6874, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 89.384, | |
| "grad_norm": 23.580543518066406, | |
| "learning_rate": 1.3764244466885141e-06, | |
| "loss": 1.8601, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 89.704, | |
| "grad_norm": 23.068803787231445, | |
| "learning_rate": 1.3584007498228602e-06, | |
| "loss": 1.813, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 90.0, | |
| "grad_norm": 24.027877807617188, | |
| "learning_rate": 1.3425421036992098e-06, | |
| "loss": 1.7416, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 90.32, | |
| "grad_norm": 27.069442749023438, | |
| "learning_rate": 1.3253260223227138e-06, | |
| "loss": 1.715, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 90.64, | |
| "grad_norm": 22.877151489257812, | |
| "learning_rate": 1.30853757785121e-06, | |
| "loss": 1.8249, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 90.96, | |
| "grad_norm": 24.492151260375977, | |
| "learning_rate": 1.292178490025803e-06, | |
| "loss": 1.7188, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 91.256, | |
| "grad_norm": 20.234020233154297, | |
| "learning_rate": 1.2778237968421458e-06, | |
| "loss": 1.5356, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 91.576, | |
| "grad_norm": 25.770517349243164, | |
| "learning_rate": 1.2622850668726402e-06, | |
| "loss": 1.6071, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 91.896, | |
| "grad_norm": 28.51846694946289, | |
| "learning_rate": 1.2471804314714615e-06, | |
| "loss": 1.7475, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 92.192, | |
| "grad_norm": 22.339611053466797, | |
| "learning_rate": 1.232511437897046e-06, | |
| "loss": 1.6772, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 92.512, | |
| "grad_norm": 25.455184936523438, | |
| "learning_rate": 1.2182795887824252e-06, | |
| "loss": 1.9423, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 92.832, | |
| "grad_norm": 22.2073974609375, | |
| "learning_rate": 1.2044863419812993e-06, | |
| "loss": 1.6468, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 93.128, | |
| "grad_norm": 23.767728805541992, | |
| "learning_rate": 1.1924485935515493e-06, | |
| "loss": 1.6161, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 93.448, | |
| "grad_norm": 22.131113052368164, | |
| "learning_rate": 1.179492546504283e-06, | |
| "loss": 1.7268, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 93.768, | |
| "grad_norm": 20.83936882019043, | |
| "learning_rate": 1.1669790749599883e-06, | |
| "loss": 1.6866, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 94.064, | |
| "grad_norm": 21.060152053833008, | |
| "learning_rate": 1.1560964129965143e-06, | |
| "loss": 1.6455, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 94.384, | |
| "grad_norm": 26.631736755371094, | |
| "learning_rate": 1.144427328773634e-06, | |
| "loss": 1.7459, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 94.704, | |
| "grad_norm": 22.8407039642334, | |
| "learning_rate": 1.1332044119943799e-06, | |
| "loss": 1.7396, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 95.0, | |
| "grad_norm": 24.34178352355957, | |
| "learning_rate": 1.1234862111733938e-06, | |
| "loss": 1.5881, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 95.32, | |
| "grad_norm": 23.25417709350586, | |
| "learning_rate": 1.1131141419006206e-06, | |
| "loss": 1.6469, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 95.64, | |
| "grad_norm": 21.618032455444336, | |
| "learning_rate": 1.1031914476690958e-06, | |
| "loss": 1.7556, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 95.96, | |
| "grad_norm": 25.758623123168945, | |
| "learning_rate": 1.093719144919928e-06, | |
| "loss": 1.745, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 96.256, | |
| "grad_norm": 24.91753578186035, | |
| "learning_rate": 1.0846982039579245e-06, | |
| "loss": 1.6264, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 96.576, | |
| "grad_norm": 23.212156295776367, | |
| "learning_rate": 1.0761295488521977e-06, | |
| "loss": 1.7324, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 96.896, | |
| "grad_norm": 24.846576690673828, | |
| "learning_rate": 1.0680140573415042e-06, | |
| "loss": 1.7467, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 97.192, | |
| "grad_norm": 24.324583053588867, | |
| "learning_rate": 1.0610982579009123e-06, | |
| "loss": 1.5182, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 97.512, | |
| "grad_norm": 26.072097778320312, | |
| "learning_rate": 1.0538460290681764e-06, | |
| "loss": 1.7966, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 97.832, | |
| "grad_norm": 21.74355125427246, | |
| "learning_rate": 1.0470492464651147e-06, | |
| "loss": 1.6975, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 98.128, | |
| "grad_norm": 24.43443489074707, | |
| "learning_rate": 1.0413221250522967e-06, | |
| "loss": 1.6646, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 98.448, | |
| "grad_norm": 21.463241577148438, | |
| "learning_rate": 1.0353925697965052e-06, | |
| "loss": 1.7691, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 98.768, | |
| "grad_norm": 21.25942039489746, | |
| "learning_rate": 1.0299203510689471e-06, | |
| "loss": 1.7827, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 99.064, | |
| "grad_norm": 23.53223419189453, | |
| "learning_rate": 1.0253868412111096e-06, | |
| "loss": 1.6516, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 99.384, | |
| "grad_norm": 21.707815170288086, | |
| "learning_rate": 1.020785067442512e-06, | |
| "loss": 1.6417, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 99.704, | |
| "grad_norm": 23.734600067138672, | |
| "learning_rate": 1.0166421265362642e-06, | |
| "loss": 1.7752, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "grad_norm": 22.472688674926758, | |
| "learning_rate": 1.0129584428786632e-06, | |
| "loss": 1.6273, | |
| "step": 3200 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3200, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 100, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |