| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9990846083431411, |
| "eval_steps": 500, |
| "global_step": 955, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0010461618935530272, |
| "grad_norm": 4.961991027903084, |
| "learning_rate": 2.0833333333333333e-07, |
| "loss": 1.5879, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.005230809467765137, |
| "grad_norm": 4.618544468117898, |
| "learning_rate": 1.0416666666666667e-06, |
| "loss": 1.5933, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.010461618935530274, |
| "grad_norm": 4.037023933749454, |
| "learning_rate": 2.0833333333333334e-06, |
| "loss": 1.5702, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.01569242840329541, |
| "grad_norm": 1.8708269586813249, |
| "learning_rate": 3.125e-06, |
| "loss": 1.5369, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.020923237871060547, |
| "grad_norm": 1.5902570000850822, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 1.4646, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.026154047338825683, |
| "grad_norm": 1.23920395738266, |
| "learning_rate": 5.208333333333334e-06, |
| "loss": 1.4335, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.03138485680659082, |
| "grad_norm": 1.1663956891784453, |
| "learning_rate": 6.25e-06, |
| "loss": 1.3982, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.03661566627435596, |
| "grad_norm": 0.8974420486494152, |
| "learning_rate": 7.291666666666667e-06, |
| "loss": 1.3919, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.041846475742121095, |
| "grad_norm": 0.8445948684549006, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 1.3576, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.04707728520988623, |
| "grad_norm": 0.6980197067702758, |
| "learning_rate": 9.375000000000001e-06, |
| "loss": 1.3582, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.052308094677651365, |
| "grad_norm": 0.8409122561540898, |
| "learning_rate": 1.0416666666666668e-05, |
| "loss": 1.3458, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0575389041454165, |
| "grad_norm": 0.6948599150905024, |
| "learning_rate": 1.1458333333333333e-05, |
| "loss": 1.335, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.06276971361318164, |
| "grad_norm": 0.651340813779911, |
| "learning_rate": 1.25e-05, |
| "loss": 1.322, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.06800052308094677, |
| "grad_norm": 0.684798803653097, |
| "learning_rate": 1.3541666666666668e-05, |
| "loss": 1.3456, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.07323133254871192, |
| "grad_norm": 0.6258722254574212, |
| "learning_rate": 1.4583333333333333e-05, |
| "loss": 1.317, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.07846214201647705, |
| "grad_norm": 0.661994715425898, |
| "learning_rate": 1.5625e-05, |
| "loss": 1.314, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08369295148424219, |
| "grad_norm": 0.6157440945511082, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 1.3174, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.08892376095200732, |
| "grad_norm": 0.6136457513656507, |
| "learning_rate": 1.7708333333333335e-05, |
| "loss": 1.3208, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.09415457041977246, |
| "grad_norm": 0.6609752901635255, |
| "learning_rate": 1.8750000000000002e-05, |
| "loss": 1.3217, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0993853798875376, |
| "grad_norm": 0.6433338679997594, |
| "learning_rate": 1.979166666666667e-05, |
| "loss": 1.3209, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.10461618935530273, |
| "grad_norm": 0.748785028585811, |
| "learning_rate": 1.999892997072575e-05, |
| "loss": 1.3015, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.10984699882306787, |
| "grad_norm": 0.7348845509310983, |
| "learning_rate": 1.99945833692589e-05, |
| "loss": 1.3043, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.115077808290833, |
| "grad_norm": 0.7150167557908745, |
| "learning_rate": 1.9986894771071707e-05, |
| "loss": 1.3036, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.12030861775859815, |
| "grad_norm": 0.6229579295007591, |
| "learning_rate": 1.9975866747083734e-05, |
| "loss": 1.3046, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.12553942722636327, |
| "grad_norm": 0.626634171036282, |
| "learning_rate": 1.9961502984854394e-05, |
| "loss": 1.3011, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.1307702366941284, |
| "grad_norm": 0.6732748047847544, |
| "learning_rate": 1.9943808287349902e-05, |
| "loss": 1.2922, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.13600104616189354, |
| "grad_norm": 0.6336745846509011, |
| "learning_rate": 1.992278857133726e-05, |
| "loss": 1.3023, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.1412318556296587, |
| "grad_norm": 0.63287698051173, |
| "learning_rate": 1.9898450865405786e-05, |
| "loss": 1.2912, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.14646266509742384, |
| "grad_norm": 0.6243758841550209, |
| "learning_rate": 1.9870803307616916e-05, |
| "loss": 1.2953, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.15169347456518897, |
| "grad_norm": 0.7127519633361736, |
| "learning_rate": 1.983985514278296e-05, |
| "loss": 1.2955, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.1569242840329541, |
| "grad_norm": 0.6226333486523956, |
| "learning_rate": 1.9805616719375852e-05, |
| "loss": 1.2932, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.16215509350071924, |
| "grad_norm": 0.6823517100118245, |
| "learning_rate": 1.9768099486066776e-05, |
| "loss": 1.2745, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.16738590296848438, |
| "grad_norm": 0.6663881488848158, |
| "learning_rate": 1.9727315987897993e-05, |
| "loss": 1.3024, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.17261671243624951, |
| "grad_norm": 0.6250204764928626, |
| "learning_rate": 1.9683279862087986e-05, |
| "loss": 1.2725, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.17784752190401465, |
| "grad_norm": 0.6587024144174084, |
| "learning_rate": 1.963600583347147e-05, |
| "loss": 1.2883, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.18307833137177978, |
| "grad_norm": 0.6553453461637575, |
| "learning_rate": 1.9585509709575646e-05, |
| "loss": 1.2934, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.18830914083954492, |
| "grad_norm": 0.6612913681156035, |
| "learning_rate": 1.9531808375334512e-05, |
| "loss": 1.2663, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.19353995030731005, |
| "grad_norm": 0.6536121257096528, |
| "learning_rate": 1.9474919787442835e-05, |
| "loss": 1.2588, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.1987707597750752, |
| "grad_norm": 0.6498560967867221, |
| "learning_rate": 1.9414862968351788e-05, |
| "loss": 1.261, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.20400156924284032, |
| "grad_norm": 0.6437028941150709, |
| "learning_rate": 1.935165799990821e-05, |
| "loss": 1.2764, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.20923237871060546, |
| "grad_norm": 0.7042353774035648, |
| "learning_rate": 1.9285326016639624e-05, |
| "loss": 1.2715, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2144631881783706, |
| "grad_norm": 0.6615769962934966, |
| "learning_rate": 1.9215889198687245e-05, |
| "loss": 1.2515, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.21969399764613573, |
| "grad_norm": 0.6674880822729611, |
| "learning_rate": 1.9143370764389374e-05, |
| "loss": 1.2828, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.22492480711390087, |
| "grad_norm": 0.6423252942504364, |
| "learning_rate": 1.906779496251763e-05, |
| "loss": 1.2791, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.230155616581666, |
| "grad_norm": 0.6391984125238104, |
| "learning_rate": 1.8989187064168643e-05, |
| "loss": 1.2519, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.23538642604943116, |
| "grad_norm": 0.6477151176504692, |
| "learning_rate": 1.8907573354313853e-05, |
| "loss": 1.2777, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.2406172355171963, |
| "grad_norm": 0.6269146787303694, |
| "learning_rate": 1.8822981123010343e-05, |
| "loss": 1.2736, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.24584804498496143, |
| "grad_norm": 0.6477481280746402, |
| "learning_rate": 1.873543865627556e-05, |
| "loss": 1.2732, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.25107885445272654, |
| "grad_norm": 0.6651047763900065, |
| "learning_rate": 1.8644975226629025e-05, |
| "loss": 1.2829, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.2563096639204917, |
| "grad_norm": 0.6432975261843621, |
| "learning_rate": 1.8551621083304147e-05, |
| "loss": 1.2615, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.2615404733882568, |
| "grad_norm": 0.6589800378988344, |
| "learning_rate": 1.8455407442133467e-05, |
| "loss": 1.2824, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.266771282856022, |
| "grad_norm": 0.6374519780745838, |
| "learning_rate": 1.8356366475110697e-05, |
| "loss": 1.2809, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.2720020923237871, |
| "grad_norm": 0.6474176000311428, |
| "learning_rate": 1.8254531299633007e-05, |
| "loss": 1.2765, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.27723290179155224, |
| "grad_norm": 0.611205619408698, |
| "learning_rate": 1.81499359674272e-05, |
| "loss": 1.2723, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.2824637112593174, |
| "grad_norm": 0.680714525048176, |
| "learning_rate": 1.8042615453163484e-05, |
| "loss": 1.2606, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.2876945207270825, |
| "grad_norm": 0.6405723682781137, |
| "learning_rate": 1.7932605642760607e-05, |
| "loss": 1.2495, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.2929253301948477, |
| "grad_norm": 0.6279226914713609, |
| "learning_rate": 1.7819943321386295e-05, |
| "loss": 1.2735, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.2981561396626128, |
| "grad_norm": 0.6284127404605249, |
| "learning_rate": 1.7704666161156994e-05, |
| "loss": 1.241, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.30338694913037795, |
| "grad_norm": 0.6499789986037162, |
| "learning_rate": 1.7586812708541046e-05, |
| "loss": 1.2607, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.30861775859814305, |
| "grad_norm": 0.6557734964553777, |
| "learning_rate": 1.746642237146948e-05, |
| "loss": 1.2746, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.3138485680659082, |
| "grad_norm": 0.6397583630089397, |
| "learning_rate": 1.7343535406158773e-05, |
| "loss": 1.2606, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.3190793775336733, |
| "grad_norm": 0.6893845618635109, |
| "learning_rate": 1.7218192903649926e-05, |
| "loss": 1.261, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.3243101870014385, |
| "grad_norm": 0.7201666359135361, |
| "learning_rate": 1.7090436776068422e-05, |
| "loss": 1.2713, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.3295409964692036, |
| "grad_norm": 0.6687526125254434, |
| "learning_rate": 1.6960309742609603e-05, |
| "loss": 1.2721, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.33477180593696876, |
| "grad_norm": 0.7677992466340372, |
| "learning_rate": 1.682785531525422e-05, |
| "loss": 1.2671, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.34000261540473387, |
| "grad_norm": 0.7054430863360542, |
| "learning_rate": 1.6693117784218818e-05, |
| "loss": 1.2512, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.34523342487249903, |
| "grad_norm": 0.7122244372275465, |
| "learning_rate": 1.655614220314598e-05, |
| "loss": 1.2625, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.35046423434026414, |
| "grad_norm": 0.679853303463346, |
| "learning_rate": 1.6416974374039227e-05, |
| "loss": 1.2767, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.3556950438080293, |
| "grad_norm": 0.6684546083293239, |
| "learning_rate": 1.6275660831947725e-05, |
| "loss": 1.253, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.3609258532757944, |
| "grad_norm": 0.6148491341875125, |
| "learning_rate": 1.6132248829405845e-05, |
| "loss": 1.2664, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.36615666274355957, |
| "grad_norm": 0.6287400163765243, |
| "learning_rate": 1.5986786320632842e-05, |
| "loss": 1.29, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.3713874722113247, |
| "grad_norm": 0.6429840333148633, |
| "learning_rate": 1.5839321945497847e-05, |
| "loss": 1.259, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.37661828167908984, |
| "grad_norm": 0.6213894267805022, |
| "learning_rate": 1.5689905013255683e-05, |
| "loss": 1.2721, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.381849091146855, |
| "grad_norm": 0.6657140337203303, |
| "learning_rate": 1.5538585486058747e-05, |
| "loss": 1.2629, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.3870799006146201, |
| "grad_norm": 0.6216486922374719, |
| "learning_rate": 1.5385413962250657e-05, |
| "loss": 1.2445, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.39231071008238527, |
| "grad_norm": 0.640435105766003, |
| "learning_rate": 1.5230441659447128e-05, |
| "loss": 1.2672, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.3975415195501504, |
| "grad_norm": 0.628008495827186, |
| "learning_rate": 1.507372039740978e-05, |
| "loss": 1.2782, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.40277232901791554, |
| "grad_norm": 0.6525051028721182, |
| "learning_rate": 1.4915302580718614e-05, |
| "loss": 1.249, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.40800313848568065, |
| "grad_norm": 0.6278258712485207, |
| "learning_rate": 1.4755241181248923e-05, |
| "loss": 1.2588, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.4132339479534458, |
| "grad_norm": 0.6550288396521164, |
| "learning_rate": 1.4593589720458507e-05, |
| "loss": 1.255, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.4184647574212109, |
| "grad_norm": 0.6820552488018682, |
| "learning_rate": 1.443040225149114e-05, |
| "loss": 1.2498, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.4236955668889761, |
| "grad_norm": 0.6399254481819071, |
| "learning_rate": 1.4265733341102235e-05, |
| "loss": 1.2666, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.4289263763567412, |
| "grad_norm": 0.6722624199607894, |
| "learning_rate": 1.4099638051412745e-05, |
| "loss": 1.245, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.43415718582450635, |
| "grad_norm": 0.6302145600803009, |
| "learning_rate": 1.3932171921497483e-05, |
| "loss": 1.2611, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.43938799529227146, |
| "grad_norm": 0.6031404765855133, |
| "learning_rate": 1.3763390948813897e-05, |
| "loss": 1.2372, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.4446188047600366, |
| "grad_norm": 0.646493114663968, |
| "learning_rate": 1.3593351570477608e-05, |
| "loss": 1.2426, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.44984961422780173, |
| "grad_norm": 0.6195059553793902, |
| "learning_rate": 1.3422110644390911e-05, |
| "loss": 1.2169, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.4550804236955669, |
| "grad_norm": 0.6012958936997463, |
| "learning_rate": 1.3249725430230595e-05, |
| "loss": 1.2728, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.460311233163332, |
| "grad_norm": 0.617177322353044, |
| "learning_rate": 1.3076253570301409e-05, |
| "loss": 1.2518, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.46554204263109716, |
| "grad_norm": 0.6292828261202444, |
| "learning_rate": 1.2901753070261565e-05, |
| "loss": 1.2543, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.4707728520988623, |
| "grad_norm": 0.597666355379697, |
| "learning_rate": 1.2726282279726788e-05, |
| "loss": 1.2427, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.47600366156662743, |
| "grad_norm": 0.6234161905971162, |
| "learning_rate": 1.2549899872759288e-05, |
| "loss": 1.2602, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.4812344710343926, |
| "grad_norm": 0.629762327137135, |
| "learning_rate": 1.237266482824832e-05, |
| "loss": 1.2428, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.4864652805021577, |
| "grad_norm": 0.6210086267634197, |
| "learning_rate": 1.2194636410188748e-05, |
| "loss": 1.2281, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.49169608996992287, |
| "grad_norm": 0.6456911580326817, |
| "learning_rate": 1.2015874147864314e-05, |
| "loss": 1.2342, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.496926899437688, |
| "grad_norm": 0.6472528263693088, |
| "learning_rate": 1.183643781594219e-05, |
| "loss": 1.2427, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.5021577089054531, |
| "grad_norm": 0.647823531177134, |
| "learning_rate": 1.165638741448548e-05, |
| "loss": 1.2522, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.5073885183732183, |
| "grad_norm": 0.6693118299281153, |
| "learning_rate": 1.147578314889033e-05, |
| "loss": 1.2619, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.5126193278409834, |
| "grad_norm": 0.667730694230485, |
| "learning_rate": 1.1294685409754434e-05, |
| "loss": 1.2429, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.5178501373087485, |
| "grad_norm": 0.6046644310580812, |
| "learning_rate": 1.1113154752683548e-05, |
| "loss": 1.2588, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.5230809467765136, |
| "grad_norm": 0.6785327532515957, |
| "learning_rate": 1.0931251878042882e-05, |
| "loss": 1.2673, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5283117562442788, |
| "grad_norm": 0.6252703595295017, |
| "learning_rate": 1.0749037610660041e-05, |
| "loss": 1.264, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.533542565712044, |
| "grad_norm": 0.6079580441878377, |
| "learning_rate": 1.0566572879486388e-05, |
| "loss": 1.2619, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.538773375179809, |
| "grad_norm": 0.6213036767388472, |
| "learning_rate": 1.0383918697223564e-05, |
| "loss": 1.2602, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.5440041846475742, |
| "grad_norm": 0.5992653066437161, |
| "learning_rate": 1.020113613992203e-05, |
| "loss": 1.2636, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.5492349941153394, |
| "grad_norm": 0.5983825011403769, |
| "learning_rate": 1.001828632655837e-05, |
| "loss": 1.254, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.5544658035831045, |
| "grad_norm": 0.6237947953829833, |
| "learning_rate": 9.835430398598319e-06, |
| "loss": 1.2573, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.5596966130508696, |
| "grad_norm": 0.6015544745764206, |
| "learning_rate": 9.652629499552216e-06, |
| "loss": 1.2332, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.5649274225186348, |
| "grad_norm": 0.6724219499123357, |
| "learning_rate": 9.469944754529784e-06, |
| "loss": 1.2489, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.5701582319863999, |
| "grad_norm": 0.5900962511293482, |
| "learning_rate": 9.28743724980107e-06, |
| "loss": 1.2486, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.575389041454165, |
| "grad_norm": 0.5939356137294083, |
| "learning_rate": 9.105168012370372e-06, |
| "loss": 1.2464, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.5806198509219301, |
| "grad_norm": 0.6072005219993765, |
| "learning_rate": 8.923197989569981e-06, |
| "loss": 1.2484, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.5858506603896954, |
| "grad_norm": 0.6408535640602984, |
| "learning_rate": 8.741588028680566e-06, |
| "loss": 1.2577, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.5910814698574605, |
| "grad_norm": 0.5820294502690248, |
| "learning_rate": 8.560398856585002e-06, |
| "loss": 1.2519, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.5963122793252256, |
| "grad_norm": 0.5866823280811903, |
| "learning_rate": 8.379691059462478e-06, |
| "loss": 1.2373, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.6015430887929907, |
| "grad_norm": 0.6268376105145648, |
| "learning_rate": 8.199525062529626e-06, |
| "loss": 1.261, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.6067738982607559, |
| "grad_norm": 0.5899507751154045, |
| "learning_rate": 8.01996110983552e-06, |
| "loss": 1.2469, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.612004707728521, |
| "grad_norm": 0.6110501827689294, |
| "learning_rate": 7.841059244117189e-06, |
| "loss": 1.24, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.6172355171962861, |
| "grad_norm": 0.628795675438309, |
| "learning_rate": 7.662879286722496e-06, |
| "loss": 1.2678, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.6224663266640512, |
| "grad_norm": 0.5817062103966367, |
| "learning_rate": 7.485480817607031e-06, |
| "loss": 1.2384, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.6276971361318164, |
| "grad_norm": 0.5950726911519293, |
| "learning_rate": 7.30892315541171e-06, |
| "loss": 1.2356, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.6329279455995815, |
| "grad_norm": 0.5727952261916434, |
| "learning_rate": 7.133265337627757e-06, |
| "loss": 1.2406, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.6381587550673467, |
| "grad_norm": 0.575271616339552, |
| "learning_rate": 6.958566100855716e-06, |
| "loss": 1.2437, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.6433895645351118, |
| "grad_norm": 0.5790390123089401, |
| "learning_rate": 6.78488386116505e-06, |
| "loss": 1.2583, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.648620374002877, |
| "grad_norm": 0.5932077205106131, |
| "learning_rate": 6.612276694560927e-06, |
| "loss": 1.2413, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.6538511834706421, |
| "grad_norm": 0.5947522152199447, |
| "learning_rate": 6.44080231756473e-06, |
| "loss": 1.2549, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.6590819929384072, |
| "grad_norm": 1.3459652842247485, |
| "learning_rate": 6.2705180679147455e-06, |
| "loss": 1.2387, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.6643128024061724, |
| "grad_norm": 0.7269750930986757, |
| "learning_rate": 6.101480885393537e-06, |
| "loss": 1.2315, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.6695436118739375, |
| "grad_norm": 0.5957998707048122, |
| "learning_rate": 5.933747292788369e-06, |
| "loss": 1.2677, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.6747744213417026, |
| "grad_norm": 0.5850290378596165, |
| "learning_rate": 5.767373376991082e-06, |
| "loss": 1.2291, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.6800052308094677, |
| "grad_norm": 0.5844027863277863, |
| "learning_rate": 5.602414770243698e-06, |
| "loss": 1.2273, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.685236040277233, |
| "grad_norm": 0.5868679965624024, |
| "learning_rate": 5.438926631536087e-06, |
| "loss": 1.2479, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.6904668497449981, |
| "grad_norm": 0.5831449921364225, |
| "learning_rate": 5.276963628161833e-06, |
| "loss": 1.247, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.6956976592127632, |
| "grad_norm": 0.5755759622405301, |
| "learning_rate": 5.116579917438564e-06, |
| "loss": 1.2447, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.7009284686805283, |
| "grad_norm": 0.5768540427743302, |
| "learning_rate": 4.957829128598781e-06, |
| "loss": 1.2177, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.7061592781482935, |
| "grad_norm": 0.598911774868325, |
| "learning_rate": 4.80076434485727e-06, |
| "loss": 1.2321, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.7113900876160586, |
| "grad_norm": 0.5969706523304447, |
| "learning_rate": 4.645438085661085e-06, |
| "loss": 1.234, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.7166208970838237, |
| "grad_norm": 0.5866245773773069, |
| "learning_rate": 4.4919022891280725e-06, |
| "loss": 1.2456, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.7218517065515888, |
| "grad_norm": 0.5811720592986388, |
| "learning_rate": 4.340208294679745e-06, |
| "loss": 1.245, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.727082516019354, |
| "grad_norm": 0.5849016337714122, |
| "learning_rate": 4.190406825874377e-06, |
| "loss": 1.2212, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.7323133254871191, |
| "grad_norm": 0.6255138841124418, |
| "learning_rate": 4.042547973446017e-06, |
| "loss": 1.2318, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.7375441349548842, |
| "grad_norm": 0.5808819882391798, |
| "learning_rate": 3.896681178555099e-06, |
| "loss": 1.2616, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.7427749444226494, |
| "grad_norm": 0.5821049889121402, |
| "learning_rate": 3.7528552162562858e-06, |
| "loss": 1.2342, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.7480057538904146, |
| "grad_norm": 0.6008223090596293, |
| "learning_rate": 3.6111181791890184e-06, |
| "loss": 1.2472, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.7532365633581797, |
| "grad_norm": 0.5950331976607752, |
| "learning_rate": 3.471517461496253e-06, |
| "loss": 1.2211, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.7584673728259448, |
| "grad_norm": 0.5695566788307966, |
| "learning_rate": 3.3340997429767786e-06, |
| "loss": 1.2397, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.76369818229371, |
| "grad_norm": 0.5922108634556766, |
| "learning_rate": 3.1989109734763936e-06, |
| "loss": 1.2624, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.7689289917614751, |
| "grad_norm": 0.573161649411567, |
| "learning_rate": 3.0659963575231544e-06, |
| "loss": 1.25, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.7741598012292402, |
| "grad_norm": 0.5745082815289888, |
| "learning_rate": 2.935400339211841e-06, |
| "loss": 1.2564, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.7793906106970053, |
| "grad_norm": 0.5599070185795609, |
| "learning_rate": 2.8071665873427244e-06, |
| "loss": 1.2341, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.7846214201647705, |
| "grad_norm": 0.5798846902278265, |
| "learning_rate": 2.681337980819536e-06, |
| "loss": 1.2425, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.7898522296325357, |
| "grad_norm": 0.5630366958265783, |
| "learning_rate": 2.5579565943116092e-06, |
| "loss": 1.2503, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.7950830391003008, |
| "grad_norm": 0.5612114479196867, |
| "learning_rate": 2.437063684184893e-06, |
| "loss": 1.2372, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.8003138485680659, |
| "grad_norm": 0.5786482688071404, |
| "learning_rate": 2.318699674706639e-06, |
| "loss": 1.2539, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.8055446580358311, |
| "grad_norm": 0.5609594448423239, |
| "learning_rate": 2.202904144528295e-06, |
| "loss": 1.2359, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.8107754675035962, |
| "grad_norm": 0.5761081087482597, |
| "learning_rate": 2.08971581345115e-06, |
| "loss": 1.2333, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.8160062769713613, |
| "grad_norm": 0.5773902237533778, |
| "learning_rate": 1.979172529479193e-06, |
| "loss": 1.2405, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.8212370864391264, |
| "grad_norm": 0.5692272061773305, |
| "learning_rate": 1.8713112561634671e-06, |
| "loss": 1.2531, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.8264678959068916, |
| "grad_norm": 0.8502635478696626, |
| "learning_rate": 1.7661680602421594e-06, |
| "loss": 1.2357, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.8316987053746567, |
| "grad_norm": 0.5725480612758482, |
| "learning_rate": 1.663778099580583e-06, |
| "loss": 1.231, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.8369295148424218, |
| "grad_norm": 0.5528630417115946, |
| "learning_rate": 1.5641756114150552e-06, |
| "loss": 1.2412, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.842160324310187, |
| "grad_norm": 0.5462565230167501, |
| "learning_rate": 1.4673939009046268e-06, |
| "loss": 1.2233, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.8473911337779522, |
| "grad_norm": 0.5663269502388393, |
| "learning_rate": 1.3734653299944834e-06, |
| "loss": 1.2277, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.8526219432457173, |
| "grad_norm": 0.5686962762287001, |
| "learning_rate": 1.2824213065947232e-06, |
| "loss": 1.2357, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.8578527527134824, |
| "grad_norm": 0.5638758372127466, |
| "learning_rate": 1.194292274078156e-06, |
| "loss": 1.2377, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.8630835621812476, |
| "grad_norm": 0.5586378869453262, |
| "learning_rate": 1.1091077011006302e-06, |
| "loss": 1.2223, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.8683143716490127, |
| "grad_norm": 0.5517524656609795, |
| "learning_rate": 1.0268960717472742e-06, |
| "loss": 1.2574, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.8735451811167778, |
| "grad_norm": 0.5569662486368889, |
| "learning_rate": 9.476848760079671e-07, |
| "loss": 1.2335, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.8787759905845429, |
| "grad_norm": 0.5643447607687533, |
| "learning_rate": 8.715006005852144e-07, |
| "loss": 1.2304, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.8840068000523081, |
| "grad_norm": 0.550747548292025, |
| "learning_rate": 7.983687200375046e-07, |
| "loss": 1.2281, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.8892376095200732, |
| "grad_norm": 0.5574658998449357, |
| "learning_rate": 7.283136882611063e-07, |
| "loss": 1.2329, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.8944684189878384, |
| "grad_norm": 0.5737161178123568, |
| "learning_rate": 6.613589303131506e-07, |
| "loss": 1.2465, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.8996992284556035, |
| "grad_norm": 0.5738100002167115, |
| "learning_rate": 5.975268345787455e-07, |
| "loss": 1.2408, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.9049300379233687, |
| "grad_norm": 0.5542103705854088, |
| "learning_rate": 5.368387452847312e-07, |
| "loss": 1.2369, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.9101608473911338, |
| "grad_norm": 0.5509438118396119, |
| "learning_rate": 4.793149553625786e-07, |
| "loss": 1.2677, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.9153916568588989, |
| "grad_norm": 0.5573824119903792, |
| "learning_rate": 4.2497469966282125e-07, |
| "loss": 1.2429, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.920622466326664, |
| "grad_norm": 0.5438647452016272, |
| "learning_rate": 3.738361485232922e-07, |
| "loss": 1.2236, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.9258532757944292, |
| "grad_norm": 0.5597553842668248, |
| "learning_rate": 3.2591640169331697e-07, |
| "loss": 1.2443, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.9310840852621943, |
| "grad_norm": 0.5541351488188309, |
| "learning_rate": 2.8123148261587465e-07, |
| "loss": 1.2292, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.9363148947299594, |
| "grad_norm": 0.5510390666909667, |
| "learning_rate": 2.397963330696751e-07, |
| "loss": 1.2371, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.9415457041977247, |
| "grad_norm": 0.5662372990839701, |
| "learning_rate": 2.0162480817291442e-07, |
| "loss": 1.2385, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.9467765136654898, |
| "grad_norm": 0.5633092432947229, |
| "learning_rate": 1.6672967175038634e-07, |
| "loss": 1.2357, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.9520073231332549, |
| "grad_norm": 0.5485802540619047, |
| "learning_rate": 1.3512259206550748e-07, |
| "loss": 1.241, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.95723813260102, |
| "grad_norm": 0.5451187400528298, |
| "learning_rate": 1.0681413791867157e-07, |
| "loss": 1.2331, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.9624689420687852, |
| "grad_norm": 0.5544164676125732, |
| "learning_rate": 8.181377511324306e-08, |
| "loss": 1.2486, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.9676997515365503, |
| "grad_norm": 0.5773845105032677, |
| "learning_rate": 6.012986329038462e-08, |
| "loss": 1.2386, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.9729305610043154, |
| "grad_norm": 0.5478843295079262, |
| "learning_rate": 4.1769653133743036e-08, |
| "loss": 1.215, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.9781613704720805, |
| "grad_norm": 0.5608458529460807, |
| "learning_rate": 2.673928394496206e-08, |
| "loss": 1.2301, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.9833921799398457, |
| "grad_norm": 0.5519105435517161, |
| "learning_rate": 1.5043781590823313e-08, |
| "loss": 1.2173, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.9886229894076108, |
| "grad_norm": 0.5536445777152418, |
| "learning_rate": 6.687056822688442e-09, |
| "loss": 1.2308, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.993853798875376, |
| "grad_norm": 0.5686300705360647, |
| "learning_rate": 1.6719039688162242e-09, |
| "loss": 1.2192, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.9990846083431411, |
| "grad_norm": 0.5671702033849003, |
| "learning_rate": 0.0, |
| "loss": 1.24, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.9990846083431411, |
| "step": 955, |
| "total_flos": 522931729858560.0, |
| "train_loss": 1.2670168572071334, |
| "train_runtime": 18760.2941, |
| "train_samples_per_second": 6.522, |
| "train_steps_per_second": 0.051 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 955, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "total_flos": 522931729858560.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |