{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 734,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013623978201634877,
      "grad_norm": 1.2644609212875366,
      "learning_rate": 1.3043478260869566e-06,
      "loss": 1.3371,
      "step": 5
    },
    {
      "epoch": 0.027247956403269755,
      "grad_norm": 0.9776316285133362,
      "learning_rate": 2.9347826086956523e-06,
      "loss": 1.3342,
      "step": 10
    },
    {
      "epoch": 0.04087193460490463,
      "grad_norm": 0.8481394052505493,
      "learning_rate": 4.565217391304348e-06,
      "loss": 1.3057,
      "step": 15
    },
    {
      "epoch": 0.05449591280653951,
      "grad_norm": 0.6515122652053833,
      "learning_rate": 6.195652173913044e-06,
      "loss": 1.2589,
      "step": 20
    },
    {
      "epoch": 0.0681198910081744,
      "grad_norm": 0.5874186158180237,
      "learning_rate": 7.826086956521738e-06,
      "loss": 1.3107,
      "step": 25
    },
    {
      "epoch": 0.08174386920980926,
      "grad_norm": 0.5392442345619202,
      "learning_rate": 9.456521739130436e-06,
      "loss": 1.2746,
      "step": 30
    },
    {
      "epoch": 0.09536784741144415,
      "grad_norm": 0.47512176632881165,
      "learning_rate": 1.108695652173913e-05,
      "loss": 1.187,
      "step": 35
    },
    {
      "epoch": 0.10899182561307902,
      "grad_norm": 0.4385203421115875,
      "learning_rate": 1.2717391304347827e-05,
      "loss": 1.2059,
      "step": 40
    },
    {
      "epoch": 0.1226158038147139,
      "grad_norm": 0.6362432241439819,
      "learning_rate": 1.4347826086956522e-05,
      "loss": 1.1893,
      "step": 45
    },
    {
      "epoch": 0.1362397820163488,
      "grad_norm": 0.46012377738952637,
      "learning_rate": 1.597826086956522e-05,
      "loss": 1.2839,
      "step": 50
    },
    {
      "epoch": 0.14986376021798364,
      "grad_norm": 0.5726909637451172,
      "learning_rate": 1.7608695652173915e-05,
      "loss": 1.1741,
      "step": 55
    },
    {
      "epoch": 0.16348773841961853,
      "grad_norm": 0.4440523684024811,
      "learning_rate": 1.9239130434782607e-05,
      "loss": 1.2477,
      "step": 60
    },
    {
      "epoch": 0.1771117166212534,
      "grad_norm": 0.4722035229206085,
      "learning_rate": 2.0869565217391306e-05,
      "loss": 1.1941,
      "step": 65
    },
    {
      "epoch": 0.1907356948228883,
      "grad_norm": 0.45267796516418457,
      "learning_rate": 2.25e-05,
      "loss": 1.2442,
      "step": 70
    },
    {
      "epoch": 0.20435967302452315,
      "grad_norm": 0.5661253929138184,
      "learning_rate": 2.4130434782608697e-05,
      "loss": 1.2069,
      "step": 75
    },
    {
      "epoch": 0.21798365122615804,
      "grad_norm": 0.46782588958740234,
      "learning_rate": 2.5760869565217392e-05,
      "loss": 1.1675,
      "step": 80
    },
    {
      "epoch": 0.23160762942779292,
      "grad_norm": 0.48947134613990784,
      "learning_rate": 2.7391304347826085e-05,
      "loss": 1.1297,
      "step": 85
    },
    {
      "epoch": 0.2452316076294278,
      "grad_norm": 0.5822666883468628,
      "learning_rate": 2.9021739130434783e-05,
      "loss": 1.1361,
      "step": 90
    },
    {
      "epoch": 0.25885558583106266,
      "grad_norm": 0.48768264055252075,
      "learning_rate": 2.9999902540146195e-05,
      "loss": 1.1555,
      "step": 95
    },
    {
      "epoch": 0.2724795640326976,
      "grad_norm": 0.5380802750587463,
      "learning_rate": 2.999880613133526e-05,
      "loss": 1.1335,
      "step": 100
    },
    {
      "epoch": 0.28610354223433243,
      "grad_norm": 0.5643327832221985,
      "learning_rate": 2.9996491578238983e-05,
      "loss": 1.0905,
      "step": 105
    },
    {
      "epoch": 0.2997275204359673,
      "grad_norm": 0.5208688974380493,
      "learning_rate": 2.9992959068836304e-05,
      "loss": 1.1015,
      "step": 110
    },
    {
      "epoch": 0.3133514986376022,
      "grad_norm": 0.5542858839035034,
      "learning_rate": 2.99882088900238e-05,
      "loss": 1.0951,
      "step": 115
    },
    {
      "epoch": 0.32697547683923706,
      "grad_norm": 0.5248655080795288,
      "learning_rate": 2.9982241427592387e-05,
      "loss": 1.0697,
      "step": 120
    },
    {
      "epoch": 0.3405994550408719,
      "grad_norm": 0.6014358401298523,
      "learning_rate": 2.997505716619599e-05,
      "loss": 1.0873,
      "step": 125
    },
    {
      "epoch": 0.3542234332425068,
      "grad_norm": 0.6086240410804749,
      "learning_rate": 2.996665668931218e-05,
      "loss": 1.0565,
      "step": 130
    },
    {
      "epoch": 0.3678474114441417,
      "grad_norm": 0.6661133766174316,
      "learning_rate": 2.9957040679194782e-05,
      "loss": 1.0233,
      "step": 135
    },
    {
      "epoch": 0.3814713896457766,
      "grad_norm": 0.707312285900116,
      "learning_rate": 2.9946209916818477e-05,
      "loss": 1.0613,
      "step": 140
    },
    {
      "epoch": 0.39509536784741145,
      "grad_norm": 0.5628401041030884,
      "learning_rate": 2.9934165281815363e-05,
      "loss": 0.9882,
      "step": 145
    },
    {
      "epoch": 0.4087193460490463,
      "grad_norm": 0.6965730786323547,
      "learning_rate": 2.9920907752403513e-05,
      "loss": 0.9984,
      "step": 150
    },
    {
      "epoch": 0.4223433242506812,
      "grad_norm": 0.7086770534515381,
      "learning_rate": 2.9906438405307548e-05,
      "loss": 0.9605,
      "step": 155
    },
    {
      "epoch": 0.4359673024523161,
      "grad_norm": 0.6676215529441833,
      "learning_rate": 2.989075841567115e-05,
      "loss": 1.0023,
      "step": 160
    },
    {
      "epoch": 0.44959128065395093,
      "grad_norm": 0.8493645191192627,
      "learning_rate": 2.987386905696167e-05,
      "loss": 0.9039,
      "step": 165
    },
    {
      "epoch": 0.46321525885558584,
      "grad_norm": 0.7191267609596252,
      "learning_rate": 2.9855771700866665e-05,
      "loss": 0.9652,
      "step": 170
    },
    {
      "epoch": 0.4768392370572207,
      "grad_norm": 0.7579267621040344,
      "learning_rate": 2.983646781718251e-05,
      "loss": 0.9237,
      "step": 175
    },
    {
      "epoch": 0.4904632152588556,
      "grad_norm": 0.8027481436729431,
      "learning_rate": 2.9815958973695034e-05,
      "loss": 0.9653,
      "step": 180
    },
    {
      "epoch": 0.5040871934604905,
      "grad_norm": 0.8199446797370911,
      "learning_rate": 2.9794246836052167e-05,
      "loss": 0.9864,
      "step": 185
    },
    {
      "epoch": 0.5177111716621253,
      "grad_norm": 0.7790918946266174,
      "learning_rate": 2.977133316762869e-05,
      "loss": 0.9403,
      "step": 190
    },
    {
      "epoch": 0.5313351498637602,
      "grad_norm": 0.7630456686019897,
      "learning_rate": 2.9747219829382997e-05,
      "loss": 0.9161,
      "step": 195
    },
    {
      "epoch": 0.5449591280653951,
      "grad_norm": 0.8171895742416382,
      "learning_rate": 2.9721908779705974e-05,
      "loss": 0.8556,
      "step": 200
    },
    {
      "epoch": 0.55858310626703,
      "grad_norm": 0.8095611929893494,
      "learning_rate": 2.969540207426193e-05,
      "loss": 0.9051,
      "step": 205
    },
    {
      "epoch": 0.5722070844686649,
      "grad_norm": 0.7118027806282043,
      "learning_rate": 2.9667701865821666e-05,
      "loss": 0.9304,
      "step": 210
    },
    {
      "epoch": 0.5858310626702997,
      "grad_norm": 0.897177517414093,
      "learning_rate": 2.9638810404087603e-05,
      "loss": 0.87,
      "step": 215
    },
    {
      "epoch": 0.5994550408719346,
      "grad_norm": 0.7748181819915771,
      "learning_rate": 2.960873003551111e-05,
      "loss": 0.8723,
      "step": 220
    },
    {
      "epoch": 0.6130790190735694,
      "grad_norm": 0.8167896270751953,
      "learning_rate": 2.9577463203101897e-05,
      "loss": 0.8648,
      "step": 225
    },
    {
      "epoch": 0.6267029972752044,
      "grad_norm": 0.8939515352249146,
      "learning_rate": 2.9545012446229613e-05,
      "loss": 0.7818,
      "step": 230
    },
    {
      "epoch": 0.6403269754768393,
      "grad_norm": 0.7820625305175781,
      "learning_rate": 2.951138040041764e-05,
      "loss": 0.9032,
      "step": 235
    },
    {
      "epoch": 0.6539509536784741,
      "grad_norm": 0.844601571559906,
      "learning_rate": 2.9476569797129e-05,
      "loss": 0.8736,
      "step": 240
    },
    {
      "epoch": 0.667574931880109,
      "grad_norm": 0.7247692942619324,
      "learning_rate": 2.944058346354454e-05,
      "loss": 0.8779,
      "step": 245
    },
    {
      "epoch": 0.6811989100817438,
      "grad_norm": 0.8070963621139526,
      "learning_rate": 2.9403424322333326e-05,
      "loss": 0.8503,
      "step": 250
    },
    {
      "epoch": 0.6948228882833788,
      "grad_norm": 0.8530706167221069,
      "learning_rate": 2.9365095391415254e-05,
      "loss": 0.8546,
      "step": 255
    },
    {
      "epoch": 0.7084468664850136,
      "grad_norm": 0.8497718572616577,
      "learning_rate": 2.932559978371596e-05,
      "loss": 0.78,
      "step": 260
    },
    {
      "epoch": 0.7220708446866485,
      "grad_norm": 0.9066139459609985,
      "learning_rate": 2.928494070691401e-05,
      "loss": 0.829,
      "step": 265
    },
    {
      "epoch": 0.7356948228882834,
      "grad_norm": 0.7985149621963501,
      "learning_rate": 2.9243121463180362e-05,
      "loss": 0.8262,
      "step": 270
    },
    {
      "epoch": 0.7493188010899182,
      "grad_norm": 1.0057857036590576,
      "learning_rate": 2.9200145448910184e-05,
      "loss": 0.7681,
      "step": 275
    },
    {
      "epoch": 0.7629427792915532,
      "grad_norm": 0.9126638770103455,
      "learning_rate": 2.915601615444703e-05,
      "loss": 0.8171,
      "step": 280
    },
    {
      "epoch": 0.776566757493188,
      "grad_norm": 0.8743240237236023,
      "learning_rate": 2.9110737163799347e-05,
      "loss": 0.7672,
      "step": 285
    },
    {
      "epoch": 0.7901907356948229,
      "grad_norm": 0.8928736448287964,
      "learning_rate": 2.9064312154349395e-05,
      "loss": 0.7824,
      "step": 290
    },
    {
      "epoch": 0.8038147138964578,
      "grad_norm": 1.0869660377502441,
      "learning_rate": 2.9016744896554606e-05,
      "loss": 0.7687,
      "step": 295
    },
    {
      "epoch": 0.8174386920980926,
      "grad_norm": 0.9520823955535889,
      "learning_rate": 2.8968039253641347e-05,
      "loss": 0.7603,
      "step": 300
    },
    {
      "epoch": 0.8310626702997275,
      "grad_norm": 0.9313263297080994,
      "learning_rate": 2.8918199181291154e-05,
      "loss": 0.7344,
      "step": 305
    },
    {
      "epoch": 0.8446866485013624,
      "grad_norm": 0.8860709071159363,
      "learning_rate": 2.8867228727319484e-05,
      "loss": 0.7221,
      "step": 310
    },
    {
      "epoch": 0.8583106267029973,
      "grad_norm": 1.0076904296875,
      "learning_rate": 2.8815132031346967e-05,
      "loss": 0.7163,
      "step": 315
    },
    {
      "epoch": 0.8719346049046321,
      "grad_norm": 1.0047346353530884,
      "learning_rate": 2.8761913324463193e-05,
      "loss": 0.731,
      "step": 320
    },
    {
      "epoch": 0.885558583106267,
      "grad_norm": 0.9905893802642822,
      "learning_rate": 2.8707576928883083e-05,
      "loss": 0.7234,
      "step": 325
    },
    {
      "epoch": 0.8991825613079019,
      "grad_norm": 0.8848779797554016,
      "learning_rate": 2.8652127257595852e-05,
      "loss": 0.7241,
      "step": 330
    },
    {
      "epoch": 0.9128065395095368,
      "grad_norm": 1.086945652961731,
      "learning_rate": 2.8595568814006618e-05,
      "loss": 0.7474,
      "step": 335
    },
    {
      "epoch": 0.9264305177111717,
      "grad_norm": 1.058239221572876,
      "learning_rate": 2.853790619157063e-05,
      "loss": 0.6512,
      "step": 340
    },
    {
      "epoch": 0.9400544959128065,
      "grad_norm": 0.9978867173194885,
      "learning_rate": 2.8479144073420237e-05,
      "loss": 0.6968,
      "step": 345
    },
    {
      "epoch": 0.9536784741144414,
      "grad_norm": 0.9755434989929199,
      "learning_rate": 2.841928723198449e-05,
      "loss": 0.6774,
      "step": 350
    },
    {
      "epoch": 0.9673024523160763,
      "grad_norm": 0.9939659833908081,
      "learning_rate": 2.835834052860162e-05,
      "loss": 0.6851,
      "step": 355
    },
    {
      "epoch": 0.9809264305177112,
      "grad_norm": 0.919965922832489,
      "learning_rate": 2.8296308913124137e-05,
      "loss": 0.6636,
      "step": 360
    },
    {
      "epoch": 0.9945504087193461,
      "grad_norm": 1.1084834337234497,
      "learning_rate": 2.8233197423516885e-05,
      "loss": 0.696,
      "step": 365
    },
    {
      "epoch": 1.008174386920981,
      "grad_norm": 1.0081297159194946,
      "learning_rate": 2.816901118544785e-05,
      "loss": 0.6079,
      "step": 370
    },
    {
      "epoch": 1.021798365122616,
      "grad_norm": 0.9290661811828613,
      "learning_rate": 2.810375541187188e-05,
      "loss": 0.5794,
      "step": 375
    },
    {
      "epoch": 1.0354223433242506,
      "grad_norm": 1.0772560834884644,
      "learning_rate": 2.80374354026073e-05,
      "loss": 0.5495,
      "step": 380
    },
    {
      "epoch": 1.0490463215258856,
      "grad_norm": 1.0354938507080078,
      "learning_rate": 2.79700565439055e-05,
      "loss": 0.6109,
      "step": 385
    },
    {
      "epoch": 1.0626702997275204,
      "grad_norm": 1.0381073951721191,
      "learning_rate": 2.7901624308013465e-05,
      "loss": 0.5849,
      "step": 390
    },
    {
      "epoch": 1.0762942779291553,
      "grad_norm": 1.146996021270752,
      "learning_rate": 2.7832144252729354e-05,
      "loss": 0.5798,
      "step": 395
    },
    {
      "epoch": 1.0899182561307903,
      "grad_norm": 1.0819785594940186,
      "learning_rate": 2.776162202095111e-05,
      "loss": 0.584,
      "step": 400
    },
    {
      "epoch": 1.103542234332425,
      "grad_norm": 1.090579628944397,
      "learning_rate": 2.7690063340218173e-05,
      "loss": 0.556,
      "step": 405
    },
    {
      "epoch": 1.11716621253406,
      "grad_norm": 1.034417748451233,
      "learning_rate": 2.7617474022246297e-05,
      "loss": 0.5788,
      "step": 410
    },
    {
      "epoch": 1.1307901907356948,
      "grad_norm": 1.1490421295166016,
      "learning_rate": 2.7543859962455576e-05,
      "loss": 0.5576,
      "step": 415
    },
    {
      "epoch": 1.1444141689373297,
      "grad_norm": 1.0201855897903442,
      "learning_rate": 2.7469227139491603e-05,
      "loss": 0.5886,
      "step": 420
    },
    {
      "epoch": 1.1580381471389645,
      "grad_norm": 1.01875901222229,
      "learning_rate": 2.7393581614739924e-05,
      "loss": 0.6062,
      "step": 425
    },
    {
      "epoch": 1.1716621253405994,
      "grad_norm": 1.0762981176376343,
      "learning_rate": 2.7316929531833775e-05,
      "loss": 0.5389,
      "step": 430
    },
    {
      "epoch": 1.1852861035422344,
      "grad_norm": 1.0140329599380493,
      "learning_rate": 2.7239277116155077e-05,
      "loss": 0.5462,
      "step": 435
    },
    {
      "epoch": 1.1989100817438691,
      "grad_norm": 1.1165515184402466,
      "learning_rate": 2.7160630674328893e-05,
      "loss": 0.5596,
      "step": 440
    },
    {
      "epoch": 1.2125340599455041,
      "grad_norm": 1.0689040422439575,
      "learning_rate": 2.7080996593711172e-05,
      "loss": 0.5137,
      "step": 445
    },
    {
      "epoch": 1.226158038147139,
      "grad_norm": 1.2397655248641968,
      "learning_rate": 2.700038134187002e-05,
      "loss": 0.5643,
      "step": 450
    },
    {
      "epoch": 1.2397820163487738,
      "grad_norm": 0.9885849952697754,
      "learning_rate": 2.691879146606043e-05,
      "loss": 0.5921,
      "step": 455
    },
    {
      "epoch": 1.2534059945504088,
      "grad_norm": 1.279818058013916,
      "learning_rate": 2.6836233592692544e-05,
      "loss": 0.5126,
      "step": 460
    },
    {
      "epoch": 1.2670299727520435,
      "grad_norm": 1.1189254522323608,
      "learning_rate": 2.675271442679346e-05,
      "loss": 0.5198,
      "step": 465
    },
    {
      "epoch": 1.2806539509536785,
      "grad_norm": 1.1345500946044922,
      "learning_rate": 2.6668240751462707e-05,
      "loss": 0.5117,
      "step": 470
    },
    {
      "epoch": 1.2942779291553133,
      "grad_norm": 1.1066192388534546,
      "learning_rate": 2.6582819427321313e-05,
      "loss": 0.5314,
      "step": 475
    },
    {
      "epoch": 1.3079019073569482,
      "grad_norm": 1.0839710235595703,
      "learning_rate": 2.649645739195464e-05,
      "loss": 0.5382,
      "step": 480
    },
    {
      "epoch": 1.3215258855585832,
      "grad_norm": 1.052043080329895,
      "learning_rate": 2.640916165934893e-05,
      "loss": 0.5135,
      "step": 485
    },
    {
      "epoch": 1.335149863760218,
      "grad_norm": 1.1333352327346802,
      "learning_rate": 2.6320939319321657e-05,
      "loss": 0.5359,
      "step": 490
    },
    {
      "epoch": 1.348773841961853,
      "grad_norm": 1.21845543384552,
      "learning_rate": 2.623179753694573e-05,
      "loss": 0.4853,
      "step": 495
    },
    {
      "epoch": 1.3623978201634879,
      "grad_norm": 1.142262578010559,
      "learning_rate": 2.614174355196754e-05,
      "loss": 0.4993,
      "step": 500
    },
    {
      "epoch": 1.3760217983651226,
      "grad_norm": 1.037720799446106,
      "learning_rate": 2.6050784678219024e-05,
      "loss": 0.512,
      "step": 505
    },
    {
      "epoch": 1.3896457765667574,
      "grad_norm": 1.1667251586914062,
      "learning_rate": 2.5958928303023634e-05,
      "loss": 0.4788,
      "step": 510
    },
    {
      "epoch": 1.4032697547683923,
      "grad_norm": 1.046761393547058,
      "learning_rate": 2.5866181886596367e-05,
      "loss": 0.4867,
      "step": 515
    },
    {
      "epoch": 1.4168937329700273,
      "grad_norm": 1.1029340028762817,
      "learning_rate": 2.5772552961437893e-05,
      "loss": 0.4799,
      "step": 520
    },
    {
      "epoch": 1.430517711171662,
      "grad_norm": 1.1235623359680176,
      "learning_rate": 2.5678049131722772e-05,
      "loss": 0.4752,
      "step": 525
    },
    {
      "epoch": 1.444141689373297,
      "grad_norm": 1.064278483390808,
      "learning_rate": 2.5582678072681903e-05,
      "loss": 0.5173,
      "step": 530
    },
    {
      "epoch": 1.457765667574932,
      "grad_norm": 1.13005793094635,
      "learning_rate": 2.5486447529979136e-05,
      "loss": 0.4451,
      "step": 535
    },
    {
      "epoch": 1.4713896457765667,
      "grad_norm": 1.0552195310592651,
      "learning_rate": 2.5389365319082226e-05,
      "loss": 0.4595,
      "step": 540
    },
    {
      "epoch": 1.4850136239782017,
      "grad_norm": 1.1078029870986938,
      "learning_rate": 2.5291439324628084e-05,
      "loss": 0.4693,
      "step": 545
    },
    {
      "epoch": 1.4986376021798364,
      "grad_norm": 1.090710997581482,
      "learning_rate": 2.5192677499782413e-05,
      "loss": 0.4537,
      "step": 550
    },
    {
      "epoch": 1.5122615803814714,
      "grad_norm": 1.037945032119751,
      "learning_rate": 2.5093087865593784e-05,
      "loss": 0.4556,
      "step": 555
    },
    {
      "epoch": 1.5258855585831061,
      "grad_norm": 1.1792460680007935,
      "learning_rate": 2.499267851034221e-05,
      "loss": 0.4734,
      "step": 560
    },
    {
      "epoch": 1.5395095367847411,
      "grad_norm": 1.0996177196502686,
      "learning_rate": 2.4891457588882238e-05,
      "loss": 0.4444,
      "step": 565
    },
    {
      "epoch": 1.553133514986376,
      "grad_norm": 1.13473641872406,
      "learning_rate": 2.478943332198062e-05,
      "loss": 0.4513,
      "step": 570
    },
    {
      "epoch": 1.5667574931880108,
      "grad_norm": 1.212607979774475,
      "learning_rate": 2.468661399564871e-05,
      "loss": 0.4506,
      "step": 575
    },
    {
      "epoch": 1.5803814713896458,
      "grad_norm": 1.0172079801559448,
      "learning_rate": 2.458300796046946e-05,
      "loss": 0.4238,
      "step": 580
    },
    {
      "epoch": 1.5940054495912808,
      "grad_norm": 1.3280726671218872,
      "learning_rate": 2.4478623630919236e-05,
      "loss": 0.4419,
      "step": 585
    },
    {
      "epoch": 1.6076294277929155,
      "grad_norm": 1.1108782291412354,
      "learning_rate": 2.437346948468441e-05,
      "loss": 0.3942,
      "step": 590
    },
    {
      "epoch": 1.6212534059945503,
      "grad_norm": 1.157791256904602,
      "learning_rate": 2.4267554061972873e-05,
      "loss": 0.397,
      "step": 595
    },
    {
      "epoch": 1.6348773841961854,
      "grad_norm": 1.1420488357543945,
      "learning_rate": 2.416088596482039e-05,
      "loss": 0.4849,
      "step": 600
    },
    {
      "epoch": 1.6485013623978202,
      "grad_norm": 1.264007329940796,
      "learning_rate": 2.405347385639202e-05,
      "loss": 0.4365,
      "step": 605
    },
    {
      "epoch": 1.662125340599455,
      "grad_norm": 1.0671672821044922,
      "learning_rate": 2.394532646027848e-05,
      "loss": 0.4259,
      "step": 610
    },
    {
      "epoch": 1.67574931880109,
      "grad_norm": 1.245835781097412,
      "learning_rate": 2.3836452559787673e-05,
      "loss": 0.4078,
      "step": 615
    },
    {
      "epoch": 1.6893732970027249,
      "grad_norm": 1.1798008680343628,
      "learning_rate": 2.3726860997231356e-05,
      "loss": 0.4319,
      "step": 620
    },
    {
      "epoch": 1.7029972752043596,
      "grad_norm": 1.3666014671325684,
      "learning_rate": 2.3616560673206984e-05,
      "loss": 0.4645,
      "step": 625
    },
    {
      "epoch": 1.7166212534059946,
      "grad_norm": 1.0500986576080322,
      "learning_rate": 2.3505560545874843e-05,
      "loss": 0.4543,
      "step": 630
    },
    {
      "epoch": 1.7302452316076296,
      "grad_norm": 1.4002113342285156,
      "learning_rate": 2.3393869630230495e-05,
      "loss": 0.3978,
      "step": 635
    },
    {
      "epoch": 1.7438692098092643,
      "grad_norm": 1.170268177986145,
      "learning_rate": 2.3281496997372625e-05,
      "loss": 0.4355,
      "step": 640
    },
    {
      "epoch": 1.757493188010899,
      "grad_norm": 1.0691126585006714,
      "learning_rate": 2.316845177376633e-05,
      "loss": 0.4097,
      "step": 645
    },
    {
      "epoch": 1.771117166212534,
      "grad_norm": 1.1894110441207886,
      "learning_rate": 2.3054743140501877e-05,
      "loss": 0.4025,
      "step": 650
    },
    {
      "epoch": 1.784741144414169,
      "grad_norm": 1.0559704303741455,
      "learning_rate": 2.2940380332549086e-05,
      "loss": 0.4237,
      "step": 655
    },
    {
      "epoch": 1.7983651226158037,
      "grad_norm": 1.1340513229370117,
      "learning_rate": 2.282537263800727e-05,
      "loss": 0.3523,
      "step": 660
    },
    {
      "epoch": 1.8119891008174387,
      "grad_norm": 1.1120586395263672,
      "learning_rate": 2.2709729397350904e-05,
      "loss": 0.4037,
      "step": 665
    },
    {
      "epoch": 1.8256130790190737,
      "grad_norm": 1.1562973260879517,
      "learning_rate": 2.2593460002671024e-05,
      "loss": 0.3839,
      "step": 670
    },
    {
      "epoch": 1.8392370572207084,
      "grad_norm": 1.2246955633163452,
      "learning_rate": 2.247657389691247e-05,
      "loss": 0.4188,
      "step": 675
    },
    {
      "epoch": 1.8528610354223434,
      "grad_norm": 0.9505524635314941,
      "learning_rate": 2.2359080573106913e-05,
      "loss": 0.3348,
      "step": 680
    },
    {
      "epoch": 1.8664850136239783,
      "grad_norm": 1.161018967628479,
      "learning_rate": 2.2240989573601902e-05,
      "loss": 0.3978,
      "step": 685
    },
    {
      "epoch": 1.880108991825613,
      "grad_norm": 1.076042890548706,
      "learning_rate": 2.212231048928587e-05,
      "loss": 0.3608,
      "step": 690
    },
    {
      "epoch": 1.8937329700272478,
      "grad_norm": 1.248965859413147,
      "learning_rate": 2.2003052958809185e-05,
      "loss": 0.3621,
      "step": 695
    },
    {
      "epoch": 1.9073569482288828,
      "grad_norm": 1.0826005935668945,
      "learning_rate": 2.1883226667801374e-05,
      "loss": 0.3733,
      "step": 700
    },
    {
      "epoch": 1.9209809264305178,
      "grad_norm": 1.1231858730316162,
      "learning_rate": 2.1762841348084425e-05,
      "loss": 0.3916,
      "step": 705
    },
    {
      "epoch": 1.9346049046321525,
      "grad_norm": 1.1095854043960571,
      "learning_rate": 2.164190677688248e-05,
      "loss": 0.3406,
      "step": 710
    },
    {
      "epoch": 1.9482288828337875,
      "grad_norm": 1.0947611331939697,
      "learning_rate": 2.1520432776027723e-05,
      "loss": 0.3511,
      "step": 715
    },
    {
      "epoch": 1.9618528610354224,
      "grad_norm": 1.035023808479309,
      "learning_rate": 2.1398429211162706e-05,
      "loss": 0.3894,
      "step": 720
    },
    {
      "epoch": 1.9754768392370572,
      "grad_norm": 1.166717529296875,
      "learning_rate": 2.127590599093909e-05,
      "loss": 0.3692,
      "step": 725
    },
    {
      "epoch": 1.989100817438692,
      "grad_norm": 1.1927279233932495,
      "learning_rate": 2.1152873066212913e-05,
      "loss": 0.3278,
      "step": 730
    }
  ],
  "logging_steps": 5,
  "max_steps": 1835,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0749654008583946e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}