{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9994000299985002, "eval_steps": 500, "global_step": 4444, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00044997750112494374, "grad_norm": 4.529098245180452, "learning_rate": 2.2471910112359554e-08, "loss": 2.198, "step": 1 }, { "epoch": 0.0022498875056247186, "grad_norm": 4.089353425911041, "learning_rate": 1.1235955056179776e-07, "loss": 2.1345, "step": 5 }, { "epoch": 0.004499775011249437, "grad_norm": 3.871692712739612, "learning_rate": 2.247191011235955e-07, "loss": 2.1219, "step": 10 }, { "epoch": 0.006749662516874156, "grad_norm": 3.48717161147707, "learning_rate": 3.3707865168539325e-07, "loss": 2.1269, "step": 15 }, { "epoch": 0.008999550022498875, "grad_norm": 2.757793504494978, "learning_rate": 4.49438202247191e-07, "loss": 1.9716, "step": 20 }, { "epoch": 0.011249437528123594, "grad_norm": 1.888825489292899, "learning_rate": 5.617977528089888e-07, "loss": 1.8304, "step": 25 }, { "epoch": 0.013499325033748313, "grad_norm": 1.775898351624132, "learning_rate": 6.741573033707865e-07, "loss": 1.5734, "step": 30 }, { "epoch": 0.01574921253937303, "grad_norm": 1.3767992538309424, "learning_rate": 7.865168539325843e-07, "loss": 1.3449, "step": 35 }, { "epoch": 0.01799910004499775, "grad_norm": 0.640753938304731, "learning_rate": 8.98876404494382e-07, "loss": 1.033, "step": 40 }, { "epoch": 0.020248987550622467, "grad_norm": 0.5235162453528913, "learning_rate": 1.01123595505618e-06, "loss": 0.8949, "step": 45 }, { "epoch": 0.02249887505624719, "grad_norm": 0.5107836396937878, "learning_rate": 1.1235955056179777e-06, "loss": 0.8114, "step": 50 }, { "epoch": 0.024748762561871907, "grad_norm": 0.4706639289305382, "learning_rate": 1.2359550561797752e-06, "loss": 0.7441, "step": 55 }, { "epoch": 0.026998650067496625, "grad_norm": 0.40589783510545635, "learning_rate": 1.348314606741573e-06, "loss": 0.69, "step": 60 }, { "epoch": 0.029248537573121344, "grad_norm": 0.35422872028585745, "learning_rate": 1.4606741573033708e-06, "loss": 0.6444, "step": 65 }, { "epoch": 0.03149842507874606, "grad_norm": 0.29411265606808407, "learning_rate": 1.5730337078651686e-06, "loss": 0.6228, "step": 70 }, { "epoch": 0.03374831258437078, "grad_norm": 0.26693653586148297, "learning_rate": 1.6853932584269663e-06, "loss": 0.5582, "step": 75 }, { "epoch": 0.0359982000899955, "grad_norm": 0.2681533501424792, "learning_rate": 1.797752808988764e-06, "loss": 0.556, "step": 80 }, { "epoch": 0.03824808759562022, "grad_norm": 0.25571380427813706, "learning_rate": 1.910112359550562e-06, "loss": 0.5195, "step": 85 }, { "epoch": 0.040497975101244935, "grad_norm": 0.2711200571248154, "learning_rate": 2.02247191011236e-06, "loss": 0.5143, "step": 90 }, { "epoch": 0.042747862606869656, "grad_norm": 0.24950121635592964, "learning_rate": 2.1348314606741574e-06, "loss": 0.494, "step": 95 }, { "epoch": 0.04499775011249438, "grad_norm": 0.27150452317695684, "learning_rate": 2.2471910112359554e-06, "loss": 0.4789, "step": 100 }, { "epoch": 0.04724763761811909, "grad_norm": 0.28474082113260213, "learning_rate": 2.359550561797753e-06, "loss": 0.4491, "step": 105 }, { "epoch": 0.049497525123743814, "grad_norm": 0.3015594402280247, "learning_rate": 2.4719101123595505e-06, "loss": 0.4403, "step": 110 }, { "epoch": 0.05174741262936853, "grad_norm": 0.30222867324963326, "learning_rate": 2.584269662921349e-06, "loss": 0.4484, "step": 115 }, { "epoch": 0.05399730013499325, "grad_norm": 0.273012018841968, "learning_rate": 2.696629213483146e-06, "loss": 0.4261, "step": 120 }, { "epoch": 0.05624718764061797, "grad_norm": 0.26965584672677667, "learning_rate": 2.8089887640449444e-06, "loss": 0.4322, "step": 125 }, { "epoch": 0.05849707514624269, "grad_norm": 0.26746073049580904, "learning_rate": 2.9213483146067416e-06, "loss": 0.4639, "step": 130 }, { "epoch": 0.06074696265186741, "grad_norm": 0.23633644420426006, "learning_rate": 3.03370786516854e-06, "loss": 0.4599, "step": 135 }, { "epoch": 0.06299685015749212, "grad_norm": 0.23444605189207685, "learning_rate": 3.146067415730337e-06, "loss": 0.3872, "step": 140 }, { "epoch": 0.06524673766311684, "grad_norm": 0.2498458749808252, "learning_rate": 3.258426966292135e-06, "loss": 0.4033, "step": 145 }, { "epoch": 0.06749662516874157, "grad_norm": 0.20527616283239267, "learning_rate": 3.3707865168539327e-06, "loss": 0.4029, "step": 150 }, { "epoch": 0.06974651267436628, "grad_norm": 0.1929264923191392, "learning_rate": 3.4831460674157306e-06, "loss": 0.3972, "step": 155 }, { "epoch": 0.071996400179991, "grad_norm": 0.2053416646605216, "learning_rate": 3.595505617977528e-06, "loss": 0.4048, "step": 160 }, { "epoch": 0.07424628768561573, "grad_norm": 0.21703322812039422, "learning_rate": 3.707865168539326e-06, "loss": 0.4141, "step": 165 }, { "epoch": 0.07649617519124044, "grad_norm": 0.1891382535207117, "learning_rate": 3.820224719101124e-06, "loss": 0.4136, "step": 170 }, { "epoch": 0.07874606269686515, "grad_norm": 0.147554774836894, "learning_rate": 3.932584269662922e-06, "loss": 0.375, "step": 175 }, { "epoch": 0.08099595020248987, "grad_norm": 0.17905729415958477, "learning_rate": 4.04494382022472e-06, "loss": 0.4017, "step": 180 }, { "epoch": 0.0832458377081146, "grad_norm": 0.1505640721404514, "learning_rate": 4.157303370786518e-06, "loss": 0.3805, "step": 185 }, { "epoch": 0.08549572521373931, "grad_norm": 0.1583375307295741, "learning_rate": 4.269662921348315e-06, "loss": 0.3782, "step": 190 }, { "epoch": 0.08774561271936403, "grad_norm": 0.18866183228882213, "learning_rate": 4.382022471910113e-06, "loss": 0.4021, "step": 195 }, { "epoch": 0.08999550022498876, "grad_norm": 0.15247202386740694, "learning_rate": 4.494382022471911e-06, "loss": 0.3601, "step": 200 }, { "epoch": 0.09224538773061347, "grad_norm": 0.15750805465065473, "learning_rate": 4.606741573033709e-06, "loss": 0.3642, "step": 205 }, { "epoch": 0.09449527523623819, "grad_norm": 0.18964645355569257, "learning_rate": 4.719101123595506e-06, "loss": 0.381, "step": 210 }, { "epoch": 0.0967451627418629, "grad_norm": 0.21688318162459963, "learning_rate": 4.831460674157304e-06, "loss": 0.3936, "step": 215 }, { "epoch": 0.09899505024748763, "grad_norm": 0.1626408315890611, "learning_rate": 4.943820224719101e-06, "loss": 0.3836, "step": 220 }, { "epoch": 0.10124493775311234, "grad_norm": 0.15324258338477442, "learning_rate": 5.0561797752809e-06, "loss": 0.3772, "step": 225 }, { "epoch": 0.10349482525873706, "grad_norm": 0.14183725887256435, "learning_rate": 5.168539325842698e-06, "loss": 0.372, "step": 230 }, { "epoch": 0.10574471276436179, "grad_norm": 0.16218789948449905, "learning_rate": 5.280898876404494e-06, "loss": 0.3571, "step": 235 }, { "epoch": 0.1079946002699865, "grad_norm": 0.1542954767008243, "learning_rate": 5.393258426966292e-06, "loss": 0.3773, "step": 240 }, { "epoch": 0.11024448777561122, "grad_norm": 0.1480650113941815, "learning_rate": 5.50561797752809e-06, "loss": 0.3679, "step": 245 }, { "epoch": 0.11249437528123594, "grad_norm": 0.14762694498356746, "learning_rate": 5.617977528089889e-06, "loss": 0.3766, "step": 250 }, { "epoch": 0.11474426278686066, "grad_norm": 0.13168710830003866, "learning_rate": 5.730337078651685e-06, "loss": 0.3563, "step": 255 }, { "epoch": 0.11699415029248537, "grad_norm": 0.12994674295453762, "learning_rate": 5.842696629213483e-06, "loss": 0.3835, "step": 260 }, { "epoch": 0.11924403779811009, "grad_norm": 0.15322453794019458, "learning_rate": 5.955056179775281e-06, "loss": 0.3523, "step": 265 }, { "epoch": 0.12149392530373482, "grad_norm": 0.14048350395524586, "learning_rate": 6.06741573033708e-06, "loss": 0.3667, "step": 270 }, { "epoch": 0.12374381280935953, "grad_norm": 0.12596877364547304, "learning_rate": 6.179775280898876e-06, "loss": 0.3675, "step": 275 }, { "epoch": 0.12599370031498425, "grad_norm": 0.14354283317022593, "learning_rate": 6.292134831460674e-06, "loss": 0.4005, "step": 280 }, { "epoch": 0.12824358782060896, "grad_norm": 0.11630361105168144, "learning_rate": 6.404494382022472e-06, "loss": 0.3747, "step": 285 }, { "epoch": 0.13049347532623368, "grad_norm": 0.13228040085041334, "learning_rate": 6.51685393258427e-06, "loss": 0.4077, "step": 290 }, { "epoch": 0.13274336283185842, "grad_norm": 0.14738366545686615, "learning_rate": 6.629213483146067e-06, "loss": 0.3998, "step": 295 }, { "epoch": 0.13499325033748313, "grad_norm": 0.12791384531800185, "learning_rate": 6.741573033707865e-06, "loss": 0.3964, "step": 300 }, { "epoch": 0.13724313784310785, "grad_norm": 0.14292143339244684, "learning_rate": 6.853932584269663e-06, "loss": 0.3665, "step": 305 }, { "epoch": 0.13949302534873256, "grad_norm": 0.13259028515993362, "learning_rate": 6.966292134831461e-06, "loss": 0.3648, "step": 310 }, { "epoch": 0.14174291285435728, "grad_norm": 0.1292871978326336, "learning_rate": 7.078651685393258e-06, "loss": 0.3729, "step": 315 }, { "epoch": 0.143992800359982, "grad_norm": 0.11157572176169035, "learning_rate": 7.191011235955056e-06, "loss": 0.3746, "step": 320 }, { "epoch": 0.1462426878656067, "grad_norm": 0.13354661592196232, "learning_rate": 7.303370786516854e-06, "loss": 0.3808, "step": 325 }, { "epoch": 0.14849257537123145, "grad_norm": 0.1306015247259671, "learning_rate": 7.415730337078652e-06, "loss": 0.3581, "step": 330 }, { "epoch": 0.15074246287685616, "grad_norm": 0.14070851299418752, "learning_rate": 7.5280898876404495e-06, "loss": 0.3687, "step": 335 }, { "epoch": 0.15299235038248088, "grad_norm": 0.12379734686286958, "learning_rate": 7.640449438202247e-06, "loss": 0.3663, "step": 340 }, { "epoch": 0.1552422378881056, "grad_norm": 0.10751947723272416, "learning_rate": 7.752808988764046e-06, "loss": 0.3852, "step": 345 }, { "epoch": 0.1574921253937303, "grad_norm": 0.1540237761883997, "learning_rate": 7.865168539325843e-06, "loss": 0.3717, "step": 350 }, { "epoch": 0.15974201289935502, "grad_norm": 0.11180120270561197, "learning_rate": 7.97752808988764e-06, "loss": 0.3489, "step": 355 }, { "epoch": 0.16199190040497974, "grad_norm": 0.12094951392614195, "learning_rate": 8.08988764044944e-06, "loss": 0.3732, "step": 360 }, { "epoch": 0.16424178791060448, "grad_norm": 0.09738867753978302, "learning_rate": 8.202247191011237e-06, "loss": 0.377, "step": 365 }, { "epoch": 0.1664916754162292, "grad_norm": 0.19496877057584272, "learning_rate": 8.314606741573035e-06, "loss": 0.3804, "step": 370 }, { "epoch": 0.1687415629218539, "grad_norm": 0.13099392604853372, "learning_rate": 8.426966292134832e-06, "loss": 0.3853, "step": 375 }, { "epoch": 0.17099145042747863, "grad_norm": 0.0929654538179481, "learning_rate": 8.53932584269663e-06, "loss": 0.3532, "step": 380 }, { "epoch": 0.17324133793310334, "grad_norm": 0.12166228073135968, "learning_rate": 8.651685393258428e-06, "loss": 0.3629, "step": 385 }, { "epoch": 0.17549122543872805, "grad_norm": 0.10261562058829776, "learning_rate": 8.764044943820226e-06, "loss": 0.3662, "step": 390 }, { "epoch": 0.17774111294435277, "grad_norm": 0.11229652632991165, "learning_rate": 8.876404494382023e-06, "loss": 0.366, "step": 395 }, { "epoch": 0.1799910004499775, "grad_norm": 0.11438088125544643, "learning_rate": 8.988764044943822e-06, "loss": 0.3766, "step": 400 }, { "epoch": 0.18224088795560223, "grad_norm": 0.11100159715996023, "learning_rate": 9.101123595505619e-06, "loss": 0.3149, "step": 405 }, { "epoch": 0.18449077546122694, "grad_norm": 0.09215304432977561, "learning_rate": 9.213483146067417e-06, "loss": 0.3505, "step": 410 }, { "epoch": 0.18674066296685166, "grad_norm": 0.11027407366862503, "learning_rate": 9.325842696629213e-06, "loss": 0.3239, "step": 415 }, { "epoch": 0.18899055047247637, "grad_norm": 0.10591097871093506, "learning_rate": 9.438202247191012e-06, "loss": 0.3763, "step": 420 }, { "epoch": 0.19124043797810109, "grad_norm": 0.12014034899424982, "learning_rate": 9.55056179775281e-06, "loss": 0.387, "step": 425 }, { "epoch": 0.1934903254837258, "grad_norm": 0.09470189013776839, "learning_rate": 9.662921348314608e-06, "loss": 0.379, "step": 430 }, { "epoch": 0.19574021298935054, "grad_norm": 0.09501092189291689, "learning_rate": 9.775280898876405e-06, "loss": 0.3642, "step": 435 }, { "epoch": 0.19799010049497526, "grad_norm": 0.11021236791362751, "learning_rate": 9.887640449438202e-06, "loss": 0.353, "step": 440 }, { "epoch": 0.20023998800059997, "grad_norm": 0.10184537613196046, "learning_rate": 1e-05, "loss": 0.365, "step": 445 }, { "epoch": 0.2024898755062247, "grad_norm": 0.08309643827704467, "learning_rate": 9.999961427623602e-06, "loss": 0.3575, "step": 450 }, { "epoch": 0.2047397630118494, "grad_norm": 0.10037722707892065, "learning_rate": 9.999845711089533e-06, "loss": 0.3471, "step": 455 }, { "epoch": 0.20698965051747412, "grad_norm": 0.11715533210340626, "learning_rate": 9.999652852183184e-06, "loss": 0.3714, "step": 460 }, { "epoch": 0.20923953802309886, "grad_norm": 0.13060134335041548, "learning_rate": 9.99938285388016e-06, "loss": 0.3635, "step": 465 }, { "epoch": 0.21148942552872357, "grad_norm": 0.08745200641347264, "learning_rate": 9.999035720346254e-06, "loss": 0.3571, "step": 470 }, { "epoch": 0.2137393130343483, "grad_norm": 0.08966989980740278, "learning_rate": 9.998611456937373e-06, "loss": 0.3639, "step": 475 }, { "epoch": 0.215989200539973, "grad_norm": 0.09956344072812717, "learning_rate": 9.998110070199454e-06, "loss": 0.3665, "step": 480 }, { "epoch": 0.21823908804559772, "grad_norm": 0.09656500042174225, "learning_rate": 9.997531567868367e-06, "loss": 0.3726, "step": 485 }, { "epoch": 0.22048897555122243, "grad_norm": 0.11853509152316483, "learning_rate": 9.996875958869803e-06, "loss": 0.3518, "step": 490 }, { "epoch": 0.22273886305684715, "grad_norm": 0.10059970891055807, "learning_rate": 9.996143253319113e-06, "loss": 0.3624, "step": 495 }, { "epoch": 0.2249887505624719, "grad_norm": 0.10793939615766127, "learning_rate": 9.995333462521178e-06, "loss": 0.3654, "step": 500 }, { "epoch": 0.2272386380680966, "grad_norm": 0.10450237535025396, "learning_rate": 9.99444659897022e-06, "loss": 0.3663, "step": 505 }, { "epoch": 0.22948852557372132, "grad_norm": 0.09806526685924745, "learning_rate": 9.993482676349612e-06, "loss": 0.342, "step": 510 }, { "epoch": 0.23173841307934603, "grad_norm": 0.0964872964969597, "learning_rate": 9.992441709531671e-06, "loss": 0.3705, "step": 515 }, { "epoch": 0.23398830058497075, "grad_norm": 0.09412389153440821, "learning_rate": 9.991323714577421e-06, "loss": 0.3541, "step": 520 }, { "epoch": 0.23623818809059546, "grad_norm": 0.08953556655443609, "learning_rate": 9.99012870873635e-06, "loss": 0.3521, "step": 525 }, { "epoch": 0.23848807559622018, "grad_norm": 0.08695345930804899, "learning_rate": 9.988856710446143e-06, "loss": 0.3505, "step": 530 }, { "epoch": 0.24073796310184492, "grad_norm": 0.09543515501943514, "learning_rate": 9.987507739332401e-06, "loss": 0.3766, "step": 535 }, { "epoch": 0.24298785060746964, "grad_norm": 0.08926010106293578, "learning_rate": 9.986081816208333e-06, "loss": 0.329, "step": 540 }, { "epoch": 0.24523773811309435, "grad_norm": 0.08598056512962657, "learning_rate": 9.984578963074436e-06, "loss": 0.3617, "step": 545 }, { "epoch": 0.24748762561871906, "grad_norm": 0.08952811194064599, "learning_rate": 9.982999203118153e-06, "loss": 0.3383, "step": 550 }, { "epoch": 0.24973751312434378, "grad_norm": 0.12658781787185433, "learning_rate": 9.981342560713528e-06, "loss": 0.3238, "step": 555 }, { "epoch": 0.2519874006299685, "grad_norm": 0.09353150867243243, "learning_rate": 9.979609061420812e-06, "loss": 0.3545, "step": 560 }, { "epoch": 0.2542372881355932, "grad_norm": 0.09177651257435882, "learning_rate": 9.977798731986079e-06, "loss": 0.3502, "step": 565 }, { "epoch": 0.2564871756412179, "grad_norm": 0.09932686671141468, "learning_rate": 9.975911600340814e-06, "loss": 0.3468, "step": 570 }, { "epoch": 0.25873706314684264, "grad_norm": 0.08192603238892632, "learning_rate": 9.973947695601477e-06, "loss": 0.3324, "step": 575 }, { "epoch": 0.26098695065246735, "grad_norm": 0.07493334234921131, "learning_rate": 9.971907048069058e-06, "loss": 0.3795, "step": 580 }, { "epoch": 0.26323683815809207, "grad_norm": 0.08138918761115761, "learning_rate": 9.969789689228606e-06, "loss": 0.3385, "step": 585 }, { "epoch": 0.26548672566371684, "grad_norm": 0.08838566706763232, "learning_rate": 9.967595651748745e-06, "loss": 0.369, "step": 590 }, { "epoch": 0.26773661316934155, "grad_norm": 0.08784958596018687, "learning_rate": 9.965324969481172e-06, "loss": 0.3169, "step": 595 }, { "epoch": 0.26998650067496627, "grad_norm": 0.0896168468240925, "learning_rate": 9.962977677460132e-06, "loss": 0.3572, "step": 600 }, { "epoch": 0.272236388180591, "grad_norm": 0.0946662419061461, "learning_rate": 9.960553811901879e-06, "loss": 0.385, "step": 605 }, { "epoch": 0.2744862756862157, "grad_norm": 0.12115400639084788, "learning_rate": 9.95805341020411e-06, "loss": 0.3595, "step": 610 }, { "epoch": 0.2767361631918404, "grad_norm": 0.0997799833296398, "learning_rate": 9.955476510945401e-06, "loss": 0.3317, "step": 615 }, { "epoch": 0.2789860506974651, "grad_norm": 0.0996130660835657, "learning_rate": 9.952823153884606e-06, "loss": 0.3449, "step": 620 }, { "epoch": 0.28123593820308984, "grad_norm": 0.07835665128694007, "learning_rate": 9.950093379960238e-06, "loss": 0.3397, "step": 625 }, { "epoch": 0.28348582570871456, "grad_norm": 0.09391607163130151, "learning_rate": 9.947287231289844e-06, "loss": 0.3776, "step": 630 }, { "epoch": 0.28573571321433927, "grad_norm": 0.09176019196191011, "learning_rate": 9.944404751169353e-06, "loss": 0.3722, "step": 635 }, { "epoch": 0.287985600719964, "grad_norm": 0.08406446635852077, "learning_rate": 9.941445984072408e-06, "loss": 0.338, "step": 640 }, { "epoch": 0.2902354882255887, "grad_norm": 0.08291987107240674, "learning_rate": 9.938410975649681e-06, "loss": 0.3742, "step": 645 }, { "epoch": 0.2924853757312134, "grad_norm": 0.08633205314263653, "learning_rate": 9.935299772728166e-06, "loss": 0.3611, "step": 650 }, { "epoch": 0.2947352632368382, "grad_norm": 0.07331502093091126, "learning_rate": 9.93211242331046e-06, "loss": 0.3344, "step": 655 }, { "epoch": 0.2969851507424629, "grad_norm": 0.08385804833550349, "learning_rate": 9.92884897657402e-06, "loss": 0.3557, "step": 660 }, { "epoch": 0.2992350382480876, "grad_norm": 0.07435080817645906, "learning_rate": 9.925509482870403e-06, "loss": 0.3405, "step": 665 }, { "epoch": 0.30148492575371233, "grad_norm": 0.08168530188324026, "learning_rate": 9.922093993724492e-06, "loss": 0.3426, "step": 670 }, { "epoch": 0.30373481325933704, "grad_norm": 0.08035820133808234, "learning_rate": 9.918602561833702e-06, "loss": 0.3604, "step": 675 }, { "epoch": 0.30598470076496176, "grad_norm": 0.08271219072869937, "learning_rate": 9.91503524106716e-06, "loss": 0.348, "step": 680 }, { "epoch": 0.3082345882705865, "grad_norm": 0.08921123968472987, "learning_rate": 9.911392086464886e-06, "loss": 0.3441, "step": 685 }, { "epoch": 0.3104844757762112, "grad_norm": 0.0839985353132867, "learning_rate": 9.907673154236929e-06, "loss": 0.3574, "step": 690 }, { "epoch": 0.3127343632818359, "grad_norm": 0.10386954688768853, "learning_rate": 9.903878501762511e-06, "loss": 0.3286, "step": 695 }, { "epoch": 0.3149842507874606, "grad_norm": 0.08778681814263677, "learning_rate": 9.900008187589138e-06, "loss": 0.3268, "step": 700 }, { "epoch": 0.31723413829308533, "grad_norm": 0.09027807015137441, "learning_rate": 9.896062271431697e-06, "loss": 0.3392, "step": 705 }, { "epoch": 0.31948402579871005, "grad_norm": 0.09567803807106381, "learning_rate": 9.89204081417153e-06, "loss": 0.3539, "step": 710 }, { "epoch": 0.32173391330433476, "grad_norm": 0.08574167684815145, "learning_rate": 9.887943877855505e-06, "loss": 0.3377, "step": 715 }, { "epoch": 0.3239838008099595, "grad_norm": 0.09260863383057749, "learning_rate": 9.883771525695052e-06, "loss": 0.3449, "step": 720 }, { "epoch": 0.32623368831558425, "grad_norm": 0.08495447140601177, "learning_rate": 9.879523822065181e-06, "loss": 0.3219, "step": 725 }, { "epoch": 0.32848357582120896, "grad_norm": 0.07533141152453762, "learning_rate": 9.875200832503505e-06, "loss": 0.3568, "step": 730 }, { "epoch": 0.3307334633268337, "grad_norm": 0.12247315370054979, "learning_rate": 9.870802623709215e-06, "loss": 0.3596, "step": 735 }, { "epoch": 0.3329833508324584, "grad_norm": 0.08738003894579985, "learning_rate": 9.866329263542055e-06, "loss": 0.3638, "step": 740 }, { "epoch": 0.3352332383380831, "grad_norm": 0.08338816245916761, "learning_rate": 9.861780821021282e-06, "loss": 0.3561, "step": 745 }, { "epoch": 0.3374831258437078, "grad_norm": 0.08236575366096931, "learning_rate": 9.857157366324587e-06, "loss": 0.3332, "step": 750 }, { "epoch": 0.33973301334933254, "grad_norm": 0.06944440484574142, "learning_rate": 9.852458970787027e-06, "loss": 0.357, "step": 755 }, { "epoch": 0.34198290085495725, "grad_norm": 0.07253573063652108, "learning_rate": 9.847685706899913e-06, "loss": 0.3245, "step": 760 }, { "epoch": 0.34423278836058196, "grad_norm": 0.07122505571988245, "learning_rate": 9.842837648309698e-06, "loss": 0.3528, "step": 765 }, { "epoch": 0.3464826758662067, "grad_norm": 0.07767571294888054, "learning_rate": 9.837914869816835e-06, "loss": 0.3395, "step": 770 }, { "epoch": 0.3487325633718314, "grad_norm": 0.07346709829835463, "learning_rate": 9.832917447374637e-06, "loss": 0.3648, "step": 775 }, { "epoch": 0.3509824508774561, "grad_norm": 0.08947827115200468, "learning_rate": 9.827845458088082e-06, "loss": 0.3521, "step": 780 }, { "epoch": 0.3532323383830808, "grad_norm": 0.06534661518603589, "learning_rate": 9.822698980212643e-06, "loss": 0.3366, "step": 785 }, { "epoch": 0.35548222588870554, "grad_norm": 0.08012548677368805, "learning_rate": 9.817478093153074e-06, "loss": 0.3752, "step": 790 }, { "epoch": 0.3577321133943303, "grad_norm": 0.08284074693974608, "learning_rate": 9.812182877462182e-06, "loss": 0.3337, "step": 795 }, { "epoch": 0.359982000899955, "grad_norm": 0.0962415375604297, "learning_rate": 9.806813414839588e-06, "loss": 0.3489, "step": 800 }, { "epoch": 0.36223188840557974, "grad_norm": 0.08299125800356132, "learning_rate": 9.801369788130468e-06, "loss": 0.3466, "step": 805 }, { "epoch": 0.36448177591120445, "grad_norm": 0.07196876922608039, "learning_rate": 9.795852081324266e-06, "loss": 0.3424, "step": 810 }, { "epoch": 0.36673166341682917, "grad_norm": 0.07251364573785335, "learning_rate": 9.79026037955341e-06, "loss": 0.3578, "step": 815 }, { "epoch": 0.3689815509224539, "grad_norm": 0.07779557103393991, "learning_rate": 9.784594769091989e-06, "loss": 0.3616, "step": 820 }, { "epoch": 0.3712314384280786, "grad_norm": 0.07434071832631806, "learning_rate": 9.778855337354426e-06, "loss": 0.3572, "step": 825 }, { "epoch": 0.3734813259337033, "grad_norm": 0.0761276852235193, "learning_rate": 9.77304217289413e-06, "loss": 0.3147, "step": 830 }, { "epoch": 0.375731213439328, "grad_norm": 0.08043122828466166, "learning_rate": 9.76715536540213e-06, "loss": 0.377, "step": 835 }, { "epoch": 0.37798110094495274, "grad_norm": 0.07418765173136689, "learning_rate": 9.761195005705685e-06, "loss": 0.3198, "step": 840 }, { "epoch": 0.38023098845057746, "grad_norm": 0.08536316659010101, "learning_rate": 9.755161185766891e-06, "loss": 0.3324, "step": 845 }, { "epoch": 0.38248087595620217, "grad_norm": 0.07947600210593922, "learning_rate": 9.74905399868126e-06, "loss": 0.3618, "step": 850 }, { "epoch": 0.3847307634618269, "grad_norm": 0.0744113590241544, "learning_rate": 9.742873538676274e-06, "loss": 0.3402, "step": 855 }, { "epoch": 0.3869806509674516, "grad_norm": 0.06709744864423575, "learning_rate": 9.73661990110995e-06, "loss": 0.3337, "step": 860 }, { "epoch": 0.38923053847307637, "grad_norm": 0.07721094678627155, "learning_rate": 9.73029318246935e-06, "loss": 0.3473, "step": 865 }, { "epoch": 0.3914804259787011, "grad_norm": 0.07436794628188735, "learning_rate": 9.723893480369106e-06, "loss": 0.3227, "step": 870 }, { "epoch": 0.3937303134843258, "grad_norm": 0.08184087425329187, "learning_rate": 9.717420893549902e-06, "loss": 0.3271, "step": 875 }, { "epoch": 0.3959802009899505, "grad_norm": 0.07323707936362174, "learning_rate": 9.71087552187696e-06, "loss": 0.3353, "step": 880 }, { "epoch": 0.39823008849557523, "grad_norm": 0.07407984516514123, "learning_rate": 9.7042574663385e-06, "loss": 0.3405, "step": 885 }, { "epoch": 0.40047997600119994, "grad_norm": 0.06705082859053621, "learning_rate": 9.697566829044172e-06, "loss": 0.3335, "step": 890 }, { "epoch": 0.40272986350682466, "grad_norm": 0.06417105200135667, "learning_rate": 9.690803713223485e-06, "loss": 0.3632, "step": 895 }, { "epoch": 0.4049797510124494, "grad_norm": 0.07661580482483403, "learning_rate": 9.68396822322422e-06, "loss": 0.341, "step": 900 }, { "epoch": 0.4072296385180741, "grad_norm": 0.07783982481846635, "learning_rate": 9.677060464510817e-06, "loss": 0.3422, "step": 905 }, { "epoch": 0.4094795260236988, "grad_norm": 0.07934781483289755, "learning_rate": 9.670080543662742e-06, "loss": 0.344, "step": 910 }, { "epoch": 0.4117294135293235, "grad_norm": 0.07206722738626223, "learning_rate": 9.663028568372845e-06, "loss": 0.3563, "step": 915 }, { "epoch": 0.41397930103494823, "grad_norm": 0.06767347411319052, "learning_rate": 9.655904647445711e-06, "loss": 0.3231, "step": 920 }, { "epoch": 0.41622918854057295, "grad_norm": 0.07180782228261029, "learning_rate": 9.64870889079596e-06, "loss": 0.3287, "step": 925 }, { "epoch": 0.4184790760461977, "grad_norm": 0.07242610923174227, "learning_rate": 9.641441409446563e-06, "loss": 0.3487, "step": 930 }, { "epoch": 0.42072896355182243, "grad_norm": 0.06832390188318747, "learning_rate": 9.634102315527136e-06, "loss": 0.325, "step": 935 }, { "epoch": 0.42297885105744715, "grad_norm": 0.07856703769371849, "learning_rate": 9.626691722272193e-06, "loss": 0.3458, "step": 940 }, { "epoch": 0.42522873856307186, "grad_norm": 0.0663937348509602, "learning_rate": 9.61920974401941e-06, "loss": 0.3513, "step": 945 }, { "epoch": 0.4274786260686966, "grad_norm": 0.07114607462059036, "learning_rate": 9.611656496207861e-06, "loss": 0.3474, "step": 950 }, { "epoch": 0.4297285135743213, "grad_norm": 0.07603014864007235, "learning_rate": 9.604032095376234e-06, "loss": 0.3362, "step": 955 }, { "epoch": 0.431978401079946, "grad_norm": 0.0734531353849079, "learning_rate": 9.596336659161031e-06, "loss": 0.3445, "step": 960 }, { "epoch": 0.4342282885855707, "grad_norm": 0.061596458285852376, "learning_rate": 9.588570306294759e-06, "loss": 0.3453, "step": 965 }, { "epoch": 0.43647817609119544, "grad_norm": 0.05885162798568731, "learning_rate": 9.58073315660409e-06, "loss": 0.3439, "step": 970 }, { "epoch": 0.43872806359682015, "grad_norm": 0.07082727968014366, "learning_rate": 9.57282533100802e-06, "loss": 0.3395, "step": 975 }, { "epoch": 0.44097795110244487, "grad_norm": 0.07316435404238263, "learning_rate": 9.564846951515997e-06, "loss": 0.3304, "step": 980 }, { "epoch": 0.4432278386080696, "grad_norm": 0.07444841963108913, "learning_rate": 9.55679814122605e-06, "loss": 0.3298, "step": 985 }, { "epoch": 0.4454777261136943, "grad_norm": 0.07294271191699972, "learning_rate": 9.548679024322866e-06, "loss": 0.3463, "step": 990 }, { "epoch": 0.447727613619319, "grad_norm": 0.07031942249727262, "learning_rate": 9.540489726075907e-06, "loss": 0.3486, "step": 995 }, { "epoch": 0.4499775011249438, "grad_norm": 0.07151326035389519, "learning_rate": 9.532230372837446e-06, "loss": 0.3537, "step": 1000 }, { "epoch": 0.4522273886305685, "grad_norm": 0.0671028535664748, "learning_rate": 9.523901092040634e-06, "loss": 0.3455, "step": 1005 }, { "epoch": 0.4544772761361932, "grad_norm": 0.07197014184781744, "learning_rate": 9.51550201219754e-06, "loss": 0.3432, "step": 1010 }, { "epoch": 0.4567271636418179, "grad_norm": 0.07169196920459484, "learning_rate": 9.507033262897142e-06, "loss": 0.31, "step": 1015 }, { "epoch": 0.45897705114744264, "grad_norm": 0.07109226686317548, "learning_rate": 9.498494974803362e-06, "loss": 0.3663, "step": 1020 }, { "epoch": 0.46122693865306735, "grad_norm": 0.05804652011529642, "learning_rate": 9.489887279653023e-06, "loss": 0.3194, "step": 1025 }, { "epoch": 0.46347682615869207, "grad_norm": 0.0700778438901929, "learning_rate": 9.481210310253826e-06, "loss": 0.3167, "step": 1030 }, { "epoch": 0.4657267136643168, "grad_norm": 0.06244080013341172, "learning_rate": 9.472464200482303e-06, "loss": 0.3127, "step": 1035 }, { "epoch": 0.4679766011699415, "grad_norm": 0.06903401204251029, "learning_rate": 9.463649085281752e-06, "loss": 0.3259, "step": 1040 }, { "epoch": 0.4702264886755662, "grad_norm": 0.07317408098224049, "learning_rate": 9.454765100660144e-06, "loss": 0.3446, "step": 1045 }, { "epoch": 0.4724763761811909, "grad_norm": 0.06487603568640564, "learning_rate": 9.445812383688046e-06, "loss": 0.3418, "step": 1050 }, { "epoch": 0.47472626368681564, "grad_norm": 0.06587470603877191, "learning_rate": 9.43679107249648e-06, "loss": 0.3473, "step": 1055 }, { "epoch": 0.47697615119244036, "grad_norm": 0.07107259617908306, "learning_rate": 9.427701306274812e-06, "loss": 0.337, "step": 1060 }, { "epoch": 0.47922603869806507, "grad_norm": 0.06697594936792645, "learning_rate": 9.418543225268598e-06, "loss": 0.3429, "step": 1065 }, { "epoch": 0.48147592620368984, "grad_norm": 0.0682858638376316, "learning_rate": 9.40931697077741e-06, "loss": 0.3358, "step": 1070 }, { "epoch": 0.48372581370931456, "grad_norm": 0.07619891304792806, "learning_rate": 9.400022685152683e-06, "loss": 0.3333, "step": 1075 }, { "epoch": 0.48597570121493927, "grad_norm": 0.07522989171574869, "learning_rate": 9.390660511795481e-06, "loss": 0.3587, "step": 1080 }, { "epoch": 0.488225588720564, "grad_norm": 0.07244707737339262, "learning_rate": 9.381230595154319e-06, "loss": 0.3386, "step": 1085 }, { "epoch": 0.4904754762261887, "grad_norm": 0.0747628006572659, "learning_rate": 9.371733080722911e-06, "loss": 0.3457, "step": 1090 }, { "epoch": 0.4927253637318134, "grad_norm": 0.0687511407497147, "learning_rate": 9.362168115037942e-06, "loss": 0.3433, "step": 1095 }, { "epoch": 0.49497525123743813, "grad_norm": 0.07512269519367433, "learning_rate": 9.352535845676791e-06, "loss": 0.3219, "step": 1100 }, { "epoch": 0.49722513874306284, "grad_norm": 0.07246031317089945, "learning_rate": 9.342836421255268e-06, "loss": 0.322, "step": 1105 }, { "epoch": 0.49947502624868756, "grad_norm": 0.07594536131369899, "learning_rate": 9.333069991425313e-06, "loss": 0.3589, "step": 1110 }, { "epoch": 0.5017249137543123, "grad_norm": 0.06689469633356987, "learning_rate": 9.323236706872685e-06, "loss": 0.357, "step": 1115 }, { "epoch": 0.503974801259937, "grad_norm": 0.06930147639704634, "learning_rate": 9.31333671931465e-06, "loss": 0.3263, "step": 1120 }, { "epoch": 0.5062246887655617, "grad_norm": 0.06133351525533005, "learning_rate": 9.303370181497623e-06, "loss": 0.3422, "step": 1125 }, { "epoch": 0.5084745762711864, "grad_norm": 0.07604072880215484, "learning_rate": 9.293337247194827e-06, "loss": 0.359, "step": 1130 }, { "epoch": 0.5107244637768111, "grad_norm": 0.06955762934734898, "learning_rate": 9.283238071203907e-06, "loss": 0.3439, "step": 1135 }, { "epoch": 0.5129743512824358, "grad_norm": 0.0803346877614296, "learning_rate": 9.27307280934455e-06, "loss": 0.3471, "step": 1140 }, { "epoch": 0.5152242387880606, "grad_norm": 0.06622678263367843, "learning_rate": 9.26284161845608e-06, "loss": 0.3427, "step": 1145 }, { "epoch": 0.5174741262936853, "grad_norm": 0.0709676185753263, "learning_rate": 9.252544656395033e-06, "loss": 0.3363, "step": 1150 }, { "epoch": 0.51972401379931, "grad_norm": 0.08282162237795766, "learning_rate": 9.242182082032729e-06, "loss": 0.341, "step": 1155 }, { "epoch": 0.5219739013049347, "grad_norm": 0.06233679346455434, "learning_rate": 9.231754055252817e-06, "loss": 0.3308, "step": 1160 }, { "epoch": 0.5242237888105594, "grad_norm": 0.05949370637515577, "learning_rate": 9.221260736948803e-06, "loss": 0.3254, "step": 1165 }, { "epoch": 0.5264736763161841, "grad_norm": 0.06102505633204194, "learning_rate": 9.21070228902158e-06, "loss": 0.327, "step": 1170 }, { "epoch": 0.528723563821809, "grad_norm": 0.07096293590033853, "learning_rate": 9.200078874376917e-06, "loss": 0.3309, "step": 1175 }, { "epoch": 0.5309734513274337, "grad_norm": 0.06374179754335971, "learning_rate": 9.189390656922955e-06, "loss": 0.3579, "step": 1180 }, { "epoch": 0.5332233388330584, "grad_norm": 0.09643830344296066, "learning_rate": 9.17863780156767e-06, "loss": 0.3466, "step": 1185 }, { "epoch": 0.5354732263386831, "grad_norm": 0.0652384061049577, "learning_rate": 9.167820474216337e-06, "loss": 0.3523, "step": 1190 }, { "epoch": 0.5377231138443078, "grad_norm": 0.06430574295906281, "learning_rate": 9.156938841768965e-06, "loss": 0.3722, "step": 1195 }, { "epoch": 0.5399730013499325, "grad_norm": 0.07648802804062793, "learning_rate": 9.145993072117724e-06, "loss": 0.321, "step": 1200 }, { "epoch": 0.5422228888555573, "grad_norm": 0.06775418329662553, "learning_rate": 9.134983334144352e-06, "loss": 0.3549, "step": 1205 }, { "epoch": 0.544472776361182, "grad_norm": 0.076334857238285, "learning_rate": 9.123909797717551e-06, "loss": 0.335, "step": 1210 }, { "epoch": 0.5467226638668067, "grad_norm": 0.06576432515389055, "learning_rate": 9.112772633690368e-06, "loss": 0.3239, "step": 1215 }, { "epoch": 0.5489725513724314, "grad_norm": 0.06872638373228167, "learning_rate": 9.101572013897555e-06, "loss": 0.3141, "step": 1220 }, { "epoch": 0.5512224388780561, "grad_norm": 0.06158733598122966, "learning_rate": 9.090308111152924e-06, "loss": 0.3221, "step": 1225 }, { "epoch": 0.5534723263836808, "grad_norm": 0.08097819934773681, "learning_rate": 9.07898109924667e-06, "loss": 0.3151, "step": 1230 }, { "epoch": 0.5557222138893055, "grad_norm": 0.06764912622152554, "learning_rate": 9.067591152942701e-06, "loss": 0.3332, "step": 1235 }, { "epoch": 0.5579721013949303, "grad_norm": 0.07314176615388208, "learning_rate": 9.056138447975936e-06, "loss": 0.3415, "step": 1240 }, { "epoch": 0.560221988900555, "grad_norm": 0.0717387579544613, "learning_rate": 9.044623161049594e-06, "loss": 0.3386, "step": 1245 }, { "epoch": 0.5624718764061797, "grad_norm": 0.07552097065323739, "learning_rate": 9.033045469832467e-06, "loss": 0.3569, "step": 1250 }, { "epoch": 0.5647217639118044, "grad_norm": 0.06915693480180615, "learning_rate": 9.02140555295618e-06, "loss": 0.3222, "step": 1255 }, { "epoch": 0.5669716514174291, "grad_norm": 0.07769020322155092, "learning_rate": 9.009703590012434e-06, "loss": 0.3185, "step": 1260 }, { "epoch": 0.5692215389230538, "grad_norm": 0.07598860570344396, "learning_rate": 8.997939761550239e-06, "loss": 0.3522, "step": 1265 }, { "epoch": 0.5714714264286785, "grad_norm": 0.07073748495565614, "learning_rate": 8.986114249073122e-06, "loss": 0.3169, "step": 1270 }, { "epoch": 0.5737213139343033, "grad_norm": 0.06866551274687982, "learning_rate": 8.97422723503633e-06, "loss": 0.3304, "step": 1275 }, { "epoch": 0.575971201439928, "grad_norm": 0.07075202015965712, "learning_rate": 8.962278902844016e-06, "loss": 0.3309, "step": 1280 }, { "epoch": 0.5782210889455527, "grad_norm": 0.07165184953921011, "learning_rate": 8.950269436846405e-06, "loss": 0.331, "step": 1285 }, { "epoch": 0.5804709764511774, "grad_norm": 0.06433134595791733, "learning_rate": 8.938199022336956e-06, "loss": 0.328, "step": 1290 }, { "epoch": 0.5827208639568021, "grad_norm": 0.07003765990675229, "learning_rate": 8.926067845549495e-06, "loss": 0.3297, "step": 1295 }, { "epoch": 0.5849707514624268, "grad_norm": 0.06653035126789796, "learning_rate": 8.913876093655351e-06, "loss": 0.335, "step": 1300 }, { "epoch": 0.5872206389680515, "grad_norm": 0.06847091877632593, "learning_rate": 8.90162395476046e-06, "loss": 0.3279, "step": 1305 }, { "epoch": 0.5894705264736764, "grad_norm": 0.06903452581161729, "learning_rate": 8.889311617902468e-06, "loss": 0.3229, "step": 1310 }, { "epoch": 0.5917204139793011, "grad_norm": 0.09341128215879058, "learning_rate": 8.876939273047813e-06, "loss": 0.299, "step": 1315 }, { "epoch": 0.5939703014849258, "grad_norm": 0.07278846497904187, "learning_rate": 8.86450711108879e-06, "loss": 0.3226, "step": 1320 }, { "epoch": 0.5962201889905505, "grad_norm": 0.06262188507904164, "learning_rate": 8.85201532384061e-06, "loss": 0.3133, "step": 1325 }, { "epoch": 0.5984700764961752, "grad_norm": 0.06880573696178596, "learning_rate": 8.839464104038445e-06, "loss": 0.2962, "step": 1330 }, { "epoch": 0.6007199640017999, "grad_norm": 0.06779234501270573, "learning_rate": 8.826853645334441e-06, "loss": 0.3124, "step": 1335 }, { "epoch": 0.6029698515074247, "grad_norm": 0.06187208772150342, "learning_rate": 8.814184142294744e-06, "loss": 0.315, "step": 1340 }, { "epoch": 0.6052197390130494, "grad_norm": 0.07896028281919827, "learning_rate": 8.80145579039649e-06, "loss": 0.3432, "step": 1345 }, { "epoch": 0.6074696265186741, "grad_norm": 0.0751755215796296, "learning_rate": 8.78866878602479e-06, "loss": 0.3239, "step": 1350 }, { "epoch": 0.6097195140242988, "grad_norm": 0.07156093608605772, "learning_rate": 8.775823326469703e-06, "loss": 0.337, "step": 1355 }, { "epoch": 0.6119694015299235, "grad_norm": 0.0727876886960586, "learning_rate": 8.76291960992319e-06, "loss": 0.3737, "step": 1360 }, { "epoch": 0.6142192890355482, "grad_norm": 0.07239709772207241, "learning_rate": 8.749957835476053e-06, "loss": 0.333, "step": 1365 }, { "epoch": 0.616469176541173, "grad_norm": 0.07320801736790428, "learning_rate": 8.736938203114872e-06, "loss": 0.344, "step": 1370 }, { "epoch": 0.6187190640467977, "grad_norm": 0.06549770224319154, "learning_rate": 8.72386091371891e-06, "loss": 0.3006, "step": 1375 }, { "epoch": 0.6209689515524224, "grad_norm": 0.07264319355187582, "learning_rate": 8.710726169057018e-06, "loss": 0.3173, "step": 1380 }, { "epoch": 0.6232188390580471, "grad_norm": 0.07992193946978773, "learning_rate": 8.697534171784523e-06, "loss": 0.3467, "step": 1385 }, { "epoch": 0.6254687265636718, "grad_norm": 0.06707456122943496, "learning_rate": 8.684285125440099e-06, "loss": 0.3297, "step": 1390 }, { "epoch": 0.6277186140692965, "grad_norm": 0.06483948438605809, "learning_rate": 8.670979234442624e-06, "loss": 0.3349, "step": 1395 }, { "epoch": 0.6299685015749212, "grad_norm": 0.0705394359218232, "learning_rate": 8.657616704088037e-06, "loss": 0.33, "step": 1400 }, { "epoch": 0.632218389080546, "grad_norm": 0.07985592647370479, "learning_rate": 8.644197740546153e-06, "loss": 0.3605, "step": 1405 }, { "epoch": 0.6344682765861707, "grad_norm": 0.08150029983078208, "learning_rate": 8.630722550857503e-06, "loss": 0.3363, "step": 1410 }, { "epoch": 0.6367181640917954, "grad_norm": 0.07286616055279489, "learning_rate": 8.617191342930118e-06, "loss": 0.3441, "step": 1415 }, { "epoch": 0.6389680515974201, "grad_norm": 0.06834780355739174, "learning_rate": 8.603604325536338e-06, "loss": 0.3298, "step": 1420 }, { "epoch": 0.6412179391030448, "grad_norm": 0.06360740971285378, "learning_rate": 8.589961708309582e-06, "loss": 0.308, "step": 1425 }, { "epoch": 0.6434678266086695, "grad_norm": 0.06387011586281786, "learning_rate": 8.576263701741115e-06, "loss": 0.3102, "step": 1430 }, { "epoch": 0.6457177141142942, "grad_norm": 0.059287022702283844, "learning_rate": 8.562510517176807e-06, "loss": 0.333, "step": 1435 }, { "epoch": 0.647967601619919, "grad_norm": 0.07076357545448068, "learning_rate": 8.54870236681386e-06, "loss": 0.3376, "step": 1440 }, { "epoch": 0.6502174891255437, "grad_norm": 0.08080237359735847, "learning_rate": 8.534839463697541e-06, "loss": 0.344, "step": 1445 }, { "epoch": 0.6524673766311685, "grad_norm": 0.07301903865415799, "learning_rate": 8.520922021717903e-06, "loss": 0.3236, "step": 1450 }, { "epoch": 0.6547172641367932, "grad_norm": 0.06408247558471158, "learning_rate": 8.506950255606466e-06, "loss": 0.3119, "step": 1455 }, { "epoch": 0.6569671516424179, "grad_norm": 0.07142979546900464, "learning_rate": 8.492924380932919e-06, "loss": 0.3235, "step": 1460 }, { "epoch": 0.6592170391480426, "grad_norm": 0.06623934823982494, "learning_rate": 8.478844614101792e-06, "loss": 0.3127, "step": 1465 }, { "epoch": 0.6614669266536674, "grad_norm": 0.064051288527217, "learning_rate": 8.464711172349105e-06, "loss": 0.3408, "step": 1470 }, { "epoch": 0.6637168141592921, "grad_norm": 0.07268808159369747, "learning_rate": 8.450524273739036e-06, "loss": 0.3406, "step": 1475 }, { "epoch": 0.6659667016649168, "grad_norm": 0.06547939077675495, "learning_rate": 8.436284137160544e-06, "loss": 0.3404, "step": 1480 }, { "epoch": 0.6682165891705415, "grad_norm": 0.11515221274329139, "learning_rate": 8.421990982323988e-06, "loss": 0.3342, "step": 1485 }, { "epoch": 0.6704664766761662, "grad_norm": 0.08213057865060075, "learning_rate": 8.407645029757752e-06, "loss": 0.3631, "step": 1490 }, { "epoch": 0.6727163641817909, "grad_norm": 0.07445316775297253, "learning_rate": 8.393246500804825e-06, "loss": 0.362, "step": 1495 }, { "epoch": 0.6749662516874156, "grad_norm": 0.0715773585848479, "learning_rate": 8.3787956176194e-06, "loss": 0.3377, "step": 1500 }, { "epoch": 0.6772161391930404, "grad_norm": 0.07245667708706742, "learning_rate": 8.36429260316344e-06, "loss": 0.2967, "step": 1505 }, { "epoch": 0.6794660266986651, "grad_norm": 0.07191851967760118, "learning_rate": 8.349737681203234e-06, "loss": 0.3447, "step": 1510 }, { "epoch": 0.6817159142042898, "grad_norm": 0.06475853768493092, "learning_rate": 8.335131076305958e-06, "loss": 0.3339, "step": 1515 }, { "epoch": 0.6839658017099145, "grad_norm": 0.07263158877410257, "learning_rate": 8.320473013836197e-06, "loss": 0.3074, "step": 1520 }, { "epoch": 0.6862156892155392, "grad_norm": 0.05942603663221257, "learning_rate": 8.305763719952467e-06, "loss": 0.2997, "step": 1525 }, { "epoch": 0.6884655767211639, "grad_norm": 0.07938168227761808, "learning_rate": 8.29100342160374e-06, "loss": 0.3122, "step": 1530 }, { "epoch": 0.6907154642267886, "grad_norm": 0.07999223664242092, "learning_rate": 8.27619234652593e-06, "loss": 0.3138, "step": 1535 }, { "epoch": 0.6929653517324134, "grad_norm": 0.06996270561203156, "learning_rate": 8.261330723238381e-06, "loss": 0.3321, "step": 1540 }, { "epoch": 0.6952152392380381, "grad_norm": 0.09647113986832291, "learning_rate": 8.246418781040345e-06, "loss": 0.3269, "step": 1545 }, { "epoch": 0.6974651267436628, "grad_norm": 0.07974144611519904, "learning_rate": 8.231456750007436e-06, "loss": 0.309, "step": 1550 }, { "epoch": 0.6997150142492875, "grad_norm": 0.06944041746000827, "learning_rate": 8.216444860988098e-06, "loss": 0.3347, "step": 1555 }, { "epoch": 0.7019649017549122, "grad_norm": 0.06697853007490644, "learning_rate": 8.20138334560002e-06, "loss": 0.3432, "step": 1560 }, { "epoch": 0.7042147892605369, "grad_norm": 0.09163411149931353, "learning_rate": 8.18627243622658e-06, "loss": 0.3294, "step": 1565 }, { "epoch": 0.7064646767661616, "grad_norm": 0.06745466757701833, "learning_rate": 8.171112366013252e-06, "loss": 0.3382, "step": 1570 }, { "epoch": 0.7087145642717864, "grad_norm": 0.06524545139947452, "learning_rate": 8.155903368864008e-06, "loss": 0.2894, "step": 1575 }, { "epoch": 0.7109644517774111, "grad_norm": 0.07357575023935092, "learning_rate": 8.140645679437713e-06, "loss": 0.345, "step": 1580 }, { "epoch": 0.7132143392830359, "grad_norm": 0.07032356069075725, "learning_rate": 8.125339533144507e-06, "loss": 0.3497, "step": 1585 }, { "epoch": 0.7154642267886606, "grad_norm": 0.07305825316899144, "learning_rate": 8.109985166142161e-06, "loss": 0.3223, "step": 1590 }, { "epoch": 0.7177141142942853, "grad_norm": 0.07026921859976491, "learning_rate": 8.09458281533244e-06, "loss": 0.3271, "step": 1595 }, { "epoch": 0.71996400179991, "grad_norm": 0.0783084169696169, "learning_rate": 8.079132718357465e-06, "loss": 0.311, "step": 1600 }, { "epoch": 0.7222138893055348, "grad_norm": 0.06977970059586212, "learning_rate": 8.063635113596006e-06, "loss": 0.3114, "step": 1605 }, { "epoch": 0.7244637768111595, "grad_norm": 0.06695382649927473, "learning_rate": 8.048090240159849e-06, "loss": 0.3186, "step": 1610 }, { "epoch": 0.7267136643167842, "grad_norm": 0.07382767142740718, "learning_rate": 8.032498337890073e-06, "loss": 0.3115, "step": 1615 }, { "epoch": 0.7289635518224089, "grad_norm": 0.0847147954522355, "learning_rate": 8.01685964735337e-06, "loss": 0.3313, "step": 1620 }, { "epoch": 0.7312134393280336, "grad_norm": 0.08710412831256738, "learning_rate": 8.00117440983832e-06, "loss": 0.3129, "step": 1625 }, { "epoch": 0.7334633268336583, "grad_norm": 0.07163605298015002, "learning_rate": 7.985442867351682e-06, "loss": 0.3197, "step": 1630 }, { "epoch": 0.735713214339283, "grad_norm": 0.08693479896494097, "learning_rate": 7.969665262614642e-06, "loss": 0.3584, "step": 1635 }, { "epoch": 0.7379631018449078, "grad_norm": 0.07181692085074703, "learning_rate": 7.953841839059086e-06, "loss": 0.3024, "step": 1640 }, { "epoch": 0.7402129893505325, "grad_norm": 0.06762138099885763, "learning_rate": 7.937972840823836e-06, "loss": 0.3393, "step": 1645 }, { "epoch": 0.7424628768561572, "grad_norm": 0.06579548855422006, "learning_rate": 7.922058512750876e-06, "loss": 0.3415, "step": 1650 }, { "epoch": 0.7447127643617819, "grad_norm": 0.06780731913871438, "learning_rate": 7.90609910038159e-06, "loss": 0.326, "step": 1655 }, { "epoch": 0.7469626518674066, "grad_norm": 0.07345224322730477, "learning_rate": 7.890094849952964e-06, "loss": 0.3579, "step": 1660 }, { "epoch": 0.7492125393730313, "grad_norm": 0.07643898702300285, "learning_rate": 7.874046008393783e-06, "loss": 0.3215, "step": 1665 }, { "epoch": 0.751462426878656, "grad_norm": 0.08111641352223722, "learning_rate": 7.857952823320833e-06, "loss": 0.3396, "step": 1670 }, { "epoch": 0.7537123143842808, "grad_norm": 0.06433102937848656, "learning_rate": 7.84181554303507e-06, "loss": 0.3229, "step": 1675 }, { "epoch": 0.7559622018899055, "grad_norm": 0.07066161687549372, "learning_rate": 7.825634416517793e-06, "loss": 0.3168, "step": 1680 }, { "epoch": 0.7582120893955302, "grad_norm": 0.07761037252783486, "learning_rate": 7.809409693426803e-06, "loss": 0.345, "step": 1685 }, { "epoch": 0.7604619769011549, "grad_norm": 0.07514558565636438, "learning_rate": 7.793141624092551e-06, "loss": 0.3423, "step": 1690 }, { "epoch": 0.7627118644067796, "grad_norm": 0.08138341842898199, "learning_rate": 7.776830459514275e-06, "loss": 0.3153, "step": 1695 }, { "epoch": 0.7649617519124043, "grad_norm": 0.07657999183778645, "learning_rate": 7.760476451356123e-06, "loss": 0.3568, "step": 1700 }, { "epoch": 0.7672116394180291, "grad_norm": 0.08932610854441203, "learning_rate": 7.744079851943286e-06, "loss": 0.3045, "step": 1705 }, { "epoch": 0.7694615269236538, "grad_norm": 0.07788581856311123, "learning_rate": 7.727640914258076e-06, "loss": 0.322, "step": 1710 }, { "epoch": 0.7717114144292785, "grad_norm": 0.07234842557571529, "learning_rate": 7.711159891936059e-06, "loss": 0.301, "step": 1715 }, { "epoch": 0.7739613019349032, "grad_norm": 0.08103908033954604, "learning_rate": 7.694637039262109e-06, "loss": 0.2934, "step": 1720 }, { "epoch": 0.776211189440528, "grad_norm": 0.08033609793206774, "learning_rate": 7.678072611166503e-06, "loss": 0.3281, "step": 1725 }, { "epoch": 0.7784610769461527, "grad_norm": 0.08682473308042656, "learning_rate": 7.661466863220982e-06, "loss": 0.3377, "step": 1730 }, { "epoch": 0.7807109644517775, "grad_norm": 0.07975724023981283, "learning_rate": 7.644820051634813e-06, "loss": 0.3312, "step": 1735 }, { "epoch": 0.7829608519574022, "grad_norm": 0.08318610850328363, "learning_rate": 7.628132433250828e-06, "loss": 0.318, "step": 1740 }, { "epoch": 0.7852107394630269, "grad_norm": 0.07470444210188223, "learning_rate": 7.611404265541464e-06, "loss": 0.3166, "step": 1745 }, { "epoch": 0.7874606269686516, "grad_norm": 0.07633984339680623, "learning_rate": 7.594635806604797e-06, "loss": 0.3068, "step": 1750 }, { "epoch": 0.7897105144742763, "grad_norm": 0.08519611137288997, "learning_rate": 7.57782731516055e-06, "loss": 0.3465, "step": 1755 }, { "epoch": 0.791960401979901, "grad_norm": 0.07125104461336126, "learning_rate": 7.560979050546103e-06, "loss": 0.311, "step": 1760 }, { "epoch": 0.7942102894855257, "grad_norm": 0.08460045098046377, "learning_rate": 7.544091272712501e-06, "loss": 0.3036, "step": 1765 }, { "epoch": 0.7964601769911505, "grad_norm": 0.07731671038628908, "learning_rate": 7.527164242220434e-06, "loss": 0.3214, "step": 1770 }, { "epoch": 0.7987100644967752, "grad_norm": 0.07618452283812552, "learning_rate": 7.510198220236217e-06, "loss": 0.3412, "step": 1775 }, { "epoch": 0.8009599520023999, "grad_norm": 0.08122249298530079, "learning_rate": 7.493193468527764e-06, "loss": 0.3129, "step": 1780 }, { "epoch": 0.8032098395080246, "grad_norm": 0.08390625774458342, "learning_rate": 7.476150249460549e-06, "loss": 0.3168, "step": 1785 }, { "epoch": 0.8054597270136493, "grad_norm": 0.07518471851900174, "learning_rate": 7.4590688259935554e-06, "loss": 0.331, "step": 1790 }, { "epoch": 0.807709614519274, "grad_norm": 0.07627971467235234, "learning_rate": 7.441949461675223e-06, "loss": 0.3471, "step": 1795 }, { "epoch": 0.8099595020248987, "grad_norm": 0.08879967466572108, "learning_rate": 7.424792420639377e-06, "loss": 0.323, "step": 1800 }, { "epoch": 0.8122093895305235, "grad_norm": 0.0858174200658171, "learning_rate": 7.407597967601155e-06, "loss": 0.3284, "step": 1805 }, { "epoch": 0.8144592770361482, "grad_norm": 0.08665127583082709, "learning_rate": 7.390366367852923e-06, "loss": 0.3217, "step": 1810 }, { "epoch": 0.8167091645417729, "grad_norm": 0.08001080258785544, "learning_rate": 7.3730978872601825e-06, "loss": 0.3248, "step": 1815 }, { "epoch": 0.8189590520473976, "grad_norm": 0.07815794847284734, "learning_rate": 7.355792792257463e-06, "loss": 0.3124, "step": 1820 }, { "epoch": 0.8212089395530223, "grad_norm": 0.0869139056537896, "learning_rate": 7.338451349844225e-06, "loss": 0.323, "step": 1825 }, { "epoch": 0.823458827058647, "grad_norm": 0.09766019302119812, "learning_rate": 7.3210738275807225e-06, "loss": 0.3332, "step": 1830 }, { "epoch": 0.8257087145642718, "grad_norm": 0.08508749834617443, "learning_rate": 7.303660493583889e-06, "loss": 0.3285, "step": 1835 }, { "epoch": 0.8279586020698965, "grad_norm": 0.10673197384722342, "learning_rate": 7.286211616523193e-06, "loss": 0.3169, "step": 1840 }, { "epoch": 0.8302084895755212, "grad_norm": 0.11681882774169298, "learning_rate": 7.268727465616497e-06, "loss": 0.331, "step": 1845 }, { "epoch": 0.8324583770811459, "grad_norm": 0.08970145688216963, "learning_rate": 7.251208310625899e-06, "loss": 0.3262, "step": 1850 }, { "epoch": 0.8347082645867706, "grad_norm": 0.08677453595649923, "learning_rate": 7.2336544218535776e-06, "loss": 0.2968, "step": 1855 }, { "epoch": 0.8369581520923954, "grad_norm": 0.08463356362517462, "learning_rate": 7.216066070137614e-06, "loss": 0.3408, "step": 1860 }, { "epoch": 0.8392080395980202, "grad_norm": 0.10768608728008885, "learning_rate": 7.198443526847816e-06, "loss": 0.3222, "step": 1865 }, { "epoch": 0.8414579271036449, "grad_norm": 0.08293925088501428, "learning_rate": 7.180787063881534e-06, "loss": 0.3225, "step": 1870 }, { "epoch": 0.8437078146092696, "grad_norm": 0.09753175069029144, "learning_rate": 7.163096953659462e-06, "loss": 0.3249, "step": 1875 }, { "epoch": 0.8459577021148943, "grad_norm": 0.10750990409191725, "learning_rate": 7.145373469121435e-06, "loss": 0.3248, "step": 1880 }, { "epoch": 0.848207589620519, "grad_norm": 0.0713111477001828, "learning_rate": 7.1276168837222215e-06, "loss": 0.3262, "step": 1885 }, { "epoch": 0.8504574771261437, "grad_norm": 0.08520099737279731, "learning_rate": 7.109827471427299e-06, "loss": 0.3248, "step": 1890 }, { "epoch": 0.8527073646317684, "grad_norm": 0.1007558956965131, "learning_rate": 7.092005506708629e-06, "loss": 0.3063, "step": 1895 }, { "epoch": 0.8549572521373932, "grad_norm": 0.10076509216745107, "learning_rate": 7.074151264540425e-06, "loss": 0.3394, "step": 1900 }, { "epoch": 0.8572071396430179, "grad_norm": 0.1128171772187796, "learning_rate": 7.056265020394908e-06, "loss": 0.3353, "step": 1905 }, { "epoch": 0.8594570271486426, "grad_norm": 0.07826929688060387, "learning_rate": 7.038347050238052e-06, "loss": 0.3313, "step": 1910 }, { "epoch": 0.8617069146542673, "grad_norm": 0.09477827641455178, "learning_rate": 7.020397630525336e-06, "loss": 0.3094, "step": 1915 }, { "epoch": 0.863956802159892, "grad_norm": 0.07996338853084985, "learning_rate": 7.002417038197466e-06, "loss": 0.3365, "step": 1920 }, { "epoch": 0.8662066896655167, "grad_norm": 0.08681950662638242, "learning_rate": 6.984405550676113e-06, "loss": 0.2858, "step": 1925 }, { "epoch": 0.8684565771711414, "grad_norm": 0.10147028587889259, "learning_rate": 6.966363445859629e-06, "loss": 0.3307, "step": 1930 }, { "epoch": 0.8707064646767662, "grad_norm": 0.09778557000247115, "learning_rate": 6.948291002118757e-06, "loss": 0.3346, "step": 1935 }, { "epoch": 0.8729563521823909, "grad_norm": 0.08335343107919917, "learning_rate": 6.930188498292334e-06, "loss": 0.3102, "step": 1940 }, { "epoch": 0.8752062396880156, "grad_norm": 0.127528307390263, "learning_rate": 6.912056213683001e-06, "loss": 0.2772, "step": 1945 }, { "epoch": 0.8774561271936403, "grad_norm": 0.08449830219805671, "learning_rate": 6.893894428052881e-06, "loss": 0.3331, "step": 1950 }, { "epoch": 0.879706014699265, "grad_norm": 0.10290918076564952, "learning_rate": 6.875703421619263e-06, "loss": 0.3162, "step": 1955 }, { "epoch": 0.8819559022048897, "grad_norm": 0.09666086595549915, "learning_rate": 6.85748347505029e-06, "loss": 0.3393, "step": 1960 }, { "epoch": 0.8842057897105144, "grad_norm": 0.09126192537758601, "learning_rate": 6.839234869460614e-06, "loss": 0.3313, "step": 1965 }, { "epoch": 0.8864556772161392, "grad_norm": 0.09213126718219308, "learning_rate": 6.820957886407068e-06, "loss": 0.3298, "step": 1970 }, { "epoch": 0.8887055647217639, "grad_norm": 0.0893744576312266, "learning_rate": 6.802652807884322e-06, "loss": 0.3258, "step": 1975 }, { "epoch": 0.8909554522273886, "grad_norm": 0.10520537204979115, "learning_rate": 6.784319916320528e-06, "loss": 0.3152, "step": 1980 }, { "epoch": 0.8932053397330133, "grad_norm": 0.09224246726284402, "learning_rate": 6.765959494572959e-06, "loss": 0.3176, "step": 1985 }, { "epoch": 0.895455227238638, "grad_norm": 0.09179694827419689, "learning_rate": 6.74757182592366e-06, "loss": 0.34, "step": 1990 }, { "epoch": 0.8977051147442627, "grad_norm": 0.10131034789212955, "learning_rate": 6.7291571940750575e-06, "loss": 0.3171, "step": 1995 }, { "epoch": 0.8999550022498876, "grad_norm": 0.11052424709399664, "learning_rate": 6.710715883145599e-06, "loss": 0.3084, "step": 2000 }, { "epoch": 0.9022048897555123, "grad_norm": 0.09523315367515199, "learning_rate": 6.692248177665357e-06, "loss": 0.3127, "step": 2005 }, { "epoch": 0.904454777261137, "grad_norm": 0.09774145840636202, "learning_rate": 6.673754362571646e-06, "loss": 0.2866, "step": 2010 }, { "epoch": 0.9067046647667617, "grad_norm": 0.1231628868544864, "learning_rate": 6.6552347232046255e-06, "loss": 0.2926, "step": 2015 }, { "epoch": 0.9089545522723864, "grad_norm": 0.09563379874509359, "learning_rate": 6.636689545302898e-06, "loss": 0.3128, "step": 2020 }, { "epoch": 0.9112044397780111, "grad_norm": 0.07820421786999905, "learning_rate": 6.6181191149990905e-06, "loss": 0.321, "step": 2025 }, { "epoch": 0.9134543272836358, "grad_norm": 0.10476028051810904, "learning_rate": 6.599523718815461e-06, "loss": 0.2836, "step": 2030 }, { "epoch": 0.9157042147892606, "grad_norm": 0.11389284533738375, "learning_rate": 6.580903643659453e-06, "loss": 0.2934, "step": 2035 }, { "epoch": 0.9179541022948853, "grad_norm": 0.10996849745288242, "learning_rate": 6.5622591768192875e-06, "loss": 0.3243, "step": 2040 }, { "epoch": 0.92020398980051, "grad_norm": 0.09512165946660596, "learning_rate": 6.5435906059595215e-06, "loss": 0.3081, "step": 2045 }, { "epoch": 0.9224538773061347, "grad_norm": 0.10421356775522515, "learning_rate": 6.524898219116612e-06, "loss": 0.2682, "step": 2050 }, { "epoch": 0.9247037648117594, "grad_norm": 0.10201698883401172, "learning_rate": 6.5061823046944694e-06, "loss": 0.2909, "step": 2055 }, { "epoch": 0.9269536523173841, "grad_norm": 0.10974937304411288, "learning_rate": 6.4874431514600146e-06, "loss": 0.3072, "step": 2060 }, { "epoch": 0.9292035398230089, "grad_norm": 0.09276233118456312, "learning_rate": 6.468681048538715e-06, "loss": 0.2989, "step": 2065 }, { "epoch": 0.9314534273286336, "grad_norm": 0.11862538493837348, "learning_rate": 6.44989628541013e-06, "loss": 0.3372, "step": 2070 }, { "epoch": 0.9337033148342583, "grad_norm": 0.10451521274212297, "learning_rate": 6.431089151903439e-06, "loss": 0.3188, "step": 2075 }, { "epoch": 0.935953202339883, "grad_norm": 0.11422644044073009, "learning_rate": 6.412259938192978e-06, "loss": 0.307, "step": 2080 }, { "epoch": 0.9382030898455077, "grad_norm": 0.14091820208432657, "learning_rate": 6.393408934793752e-06, "loss": 0.3546, "step": 2085 }, { "epoch": 0.9404529773511324, "grad_norm": 0.11829750564224563, "learning_rate": 6.374536432556963e-06, "loss": 0.3267, "step": 2090 }, { "epoch": 0.9427028648567571, "grad_norm": 0.11528106197624186, "learning_rate": 6.355642722665512e-06, "loss": 0.3203, "step": 2095 }, { "epoch": 0.9449527523623819, "grad_norm": 0.09372673822212164, "learning_rate": 6.336728096629517e-06, "loss": 0.3151, "step": 2100 }, { "epoch": 0.9472026398680066, "grad_norm": 0.10779896033185006, "learning_rate": 6.317792846281805e-06, "loss": 0.3052, "step": 2105 }, { "epoch": 0.9494525273736313, "grad_norm": 0.09672862996353586, "learning_rate": 6.298837263773423e-06, "loss": 0.3033, "step": 2110 }, { "epoch": 0.951702414879256, "grad_norm": 0.10872396340925997, "learning_rate": 6.2798616415691095e-06, "loss": 0.3002, "step": 2115 }, { "epoch": 0.9539523023848807, "grad_norm": 0.11829489090483326, "learning_rate": 6.260866272442807e-06, "loss": 0.2929, "step": 2120 }, { "epoch": 0.9562021898905054, "grad_norm": 0.11145672561455416, "learning_rate": 6.2418514494731245e-06, "loss": 0.2808, "step": 2125 }, { "epoch": 0.9584520773961301, "grad_norm": 0.1056896163271936, "learning_rate": 6.222817466038824e-06, "loss": 0.2841, "step": 2130 }, { "epoch": 0.9607019649017549, "grad_norm": 0.10666373036314321, "learning_rate": 6.2037646158142975e-06, "loss": 0.3005, "step": 2135 }, { "epoch": 0.9629518524073797, "grad_norm": 0.10697096904271322, "learning_rate": 6.184693192765028e-06, "loss": 0.2894, "step": 2140 }, { "epoch": 0.9652017399130044, "grad_norm": 0.17157045181184577, "learning_rate": 6.165603491143057e-06, "loss": 0.3298, "step": 2145 }, { "epoch": 0.9674516274186291, "grad_norm": 0.1005745666451797, "learning_rate": 6.146495805482451e-06, "loss": 0.3196, "step": 2150 }, { "epoch": 0.9697015149242538, "grad_norm": 0.139307317568223, "learning_rate": 6.127370430594745e-06, "loss": 0.2993, "step": 2155 }, { "epoch": 0.9719514024298785, "grad_norm": 0.11791582586234053, "learning_rate": 6.108227661564401e-06, "loss": 0.3083, "step": 2160 }, { "epoch": 0.9742012899355033, "grad_norm": 0.11233522118086736, "learning_rate": 6.089067793744258e-06, "loss": 0.3137, "step": 2165 }, { "epoch": 0.976451177441128, "grad_norm": 0.12524898605746265, "learning_rate": 6.069891122750971e-06, "loss": 0.2825, "step": 2170 }, { "epoch": 0.9787010649467527, "grad_norm": 0.09825541745527079, "learning_rate": 6.050697944460444e-06, "loss": 0.3146, "step": 2175 }, { "epoch": 0.9809509524523774, "grad_norm": 0.11637412785681134, "learning_rate": 6.0314885550032796e-06, "loss": 0.2935, "step": 2180 }, { "epoch": 0.9832008399580021, "grad_norm": 0.10398981333232891, "learning_rate": 6.012263250760199e-06, "loss": 0.28, "step": 2185 }, { "epoch": 0.9854507274636268, "grad_norm": 0.1347409630178848, "learning_rate": 5.993022328357466e-06, "loss": 0.2899, "step": 2190 }, { "epoch": 0.9877006149692515, "grad_norm": 0.136591408837683, "learning_rate": 5.973766084662324e-06, "loss": 0.2729, "step": 2195 }, { "epoch": 0.9899505024748763, "grad_norm": 0.1032954692332516, "learning_rate": 5.954494816778408e-06, "loss": 0.3106, "step": 2200 }, { "epoch": 0.992200389980501, "grad_norm": 0.12420490530861028, "learning_rate": 5.935208822041152e-06, "loss": 0.2699, "step": 2205 }, { "epoch": 0.9944502774861257, "grad_norm": 0.10146757951487546, "learning_rate": 5.915908398013217e-06, "loss": 0.266, "step": 2210 }, { "epoch": 0.9967001649917504, "grad_norm": 0.10690509046474422, "learning_rate": 5.896593842479893e-06, "loss": 0.2916, "step": 2215 }, { "epoch": 0.9989500524973751, "grad_norm": 0.2098417588495756, "learning_rate": 5.8772654534445e-06, "loss": 0.3104, "step": 2220 }, { "epoch": 1.0, "eval_loss": 0.27543845772743225, "eval_runtime": 55.028, "eval_samples_per_second": 19.59, "eval_steps_per_second": 4.907, "step": 2223 }, { "epoch": 1.00089995500225, "grad_norm": 0.11100179968154768, "learning_rate": 5.857923529123799e-06, "loss": 0.2341, "step": 2225 }, { "epoch": 1.0031498425078746, "grad_norm": 0.14616860643517418, "learning_rate": 5.838568367943383e-06, "loss": 0.2679, "step": 2230 }, { "epoch": 1.0053997300134994, "grad_norm": 0.11313230544533252, "learning_rate": 5.819200268533076e-06, "loss": 0.2873, "step": 2235 }, { "epoch": 1.007649617519124, "grad_norm": 0.1210465260044826, "learning_rate": 5.7998195297223285e-06, "loss": 0.2677, "step": 2240 }, { "epoch": 1.0098995050247488, "grad_norm": 0.11722674843174795, "learning_rate": 5.7804264505356e-06, "loss": 0.2548, "step": 2245 }, { "epoch": 1.0121493925303735, "grad_norm": 0.12390544554268877, "learning_rate": 5.76102133018775e-06, "loss": 0.2942, "step": 2250 }, { "epoch": 1.0143992800359982, "grad_norm": 0.14215352813872506, "learning_rate": 5.741604468079421e-06, "loss": 0.3095, "step": 2255 }, { "epoch": 1.016649167541623, "grad_norm": 0.13309421360381032, "learning_rate": 5.72217616379242e-06, "loss": 0.2794, "step": 2260 }, { "epoch": 1.0188990550472476, "grad_norm": 0.1409784002692586, "learning_rate": 5.702736717085093e-06, "loss": 0.2998, "step": 2265 }, { "epoch": 1.0211489425528724, "grad_norm": 0.12978570417210325, "learning_rate": 5.6832864278876984e-06, "loss": 0.2829, "step": 2270 }, { "epoch": 1.023398830058497, "grad_norm": 0.10750959417123264, "learning_rate": 5.663825596297794e-06, "loss": 0.2902, "step": 2275 }, { "epoch": 1.0256487175641218, "grad_norm": 0.137940819760974, "learning_rate": 5.644354522575581e-06, "loss": 0.2806, "step": 2280 }, { "epoch": 1.0278986050697465, "grad_norm": 0.14563829553392096, "learning_rate": 5.624873507139297e-06, "loss": 0.277, "step": 2285 }, { "epoch": 1.0301484925753712, "grad_norm": 0.12377796525725795, "learning_rate": 5.605382850560565e-06, "loss": 0.2943, "step": 2290 }, { "epoch": 1.032398380080996, "grad_norm": 0.16984305955909604, "learning_rate": 5.585882853559762e-06, "loss": 0.2889, "step": 2295 }, { "epoch": 1.0346482675866207, "grad_norm": 0.1281002826955631, "learning_rate": 5.566373817001377e-06, "loss": 0.293, "step": 2300 }, { "epoch": 1.0368981550922454, "grad_norm": 0.15524678076001608, "learning_rate": 5.546856041889374e-06, "loss": 0.2605, "step": 2305 }, { "epoch": 1.03914804259787, "grad_norm": 0.14215571774039212, "learning_rate": 5.527329829362534e-06, "loss": 0.2786, "step": 2310 }, { "epoch": 1.0413979301034948, "grad_norm": 0.1447748028005779, "learning_rate": 5.5077954806898284e-06, "loss": 0.2688, "step": 2315 }, { "epoch": 1.0436478176091195, "grad_norm": 0.14426858307924748, "learning_rate": 5.488253297265757e-06, "loss": 0.2777, "step": 2320 }, { "epoch": 1.0458977051147442, "grad_norm": 0.1272869099382178, "learning_rate": 5.468703580605703e-06, "loss": 0.2997, "step": 2325 }, { "epoch": 1.048147592620369, "grad_norm": 0.133865100418296, "learning_rate": 5.4491466323412745e-06, "loss": 0.2839, "step": 2330 }, { "epoch": 1.0503974801259937, "grad_norm": 0.12437130432718715, "learning_rate": 5.429582754215664e-06, "loss": 0.2843, "step": 2335 }, { "epoch": 1.0526473676316184, "grad_norm": 0.1419352738893503, "learning_rate": 5.410012248078975e-06, "loss": 0.2677, "step": 2340 }, { "epoch": 1.054897255137243, "grad_norm": 0.1639413029064359, "learning_rate": 5.390435415883583e-06, "loss": 0.2805, "step": 2345 }, { "epoch": 1.0571471426428678, "grad_norm": 0.14750894149267404, "learning_rate": 5.370852559679461e-06, "loss": 0.2718, "step": 2350 }, { "epoch": 1.0593970301484925, "grad_norm": 0.1418143669594509, "learning_rate": 5.351263981609532e-06, "loss": 0.2374, "step": 2355 }, { "epoch": 1.0616469176541172, "grad_norm": 0.1467085192211227, "learning_rate": 5.331669983904996e-06, "loss": 0.278, "step": 2360 }, { "epoch": 1.063896805159742, "grad_norm": 0.1686191463372291, "learning_rate": 5.312070868880678e-06, "loss": 0.2818, "step": 2365 }, { "epoch": 1.0661466926653667, "grad_norm": 0.1844876464618337, "learning_rate": 5.29246693893035e-06, "loss": 0.2971, "step": 2370 }, { "epoch": 1.0683965801709914, "grad_norm": 0.12521919673631507, "learning_rate": 5.272858496522084e-06, "loss": 0.2737, "step": 2375 }, { "epoch": 1.070646467676616, "grad_norm": 0.15034047715143825, "learning_rate": 5.253245844193564e-06, "loss": 0.2858, "step": 2380 }, { "epoch": 1.0728963551822408, "grad_norm": 0.1264075738033277, "learning_rate": 5.233629284547435e-06, "loss": 0.2564, "step": 2385 }, { "epoch": 1.0751462426878655, "grad_norm": 0.1940932983786269, "learning_rate": 5.214009120246623e-06, "loss": 0.2722, "step": 2390 }, { "epoch": 1.0773961301934902, "grad_norm": 0.16922904631843647, "learning_rate": 5.1943856540096795e-06, "loss": 0.2912, "step": 2395 }, { "epoch": 1.079646017699115, "grad_norm": 0.23716139744779294, "learning_rate": 5.174759188606087e-06, "loss": 0.2885, "step": 2400 }, { "epoch": 1.0818959052047397, "grad_norm": 0.11509288529342813, "learning_rate": 5.155130026851616e-06, "loss": 0.2575, "step": 2405 }, { "epoch": 1.0841457927103644, "grad_norm": 0.17727493415132747, "learning_rate": 5.135498471603629e-06, "loss": 0.2639, "step": 2410 }, { "epoch": 1.0863956802159893, "grad_norm": 0.13930766876349623, "learning_rate": 5.1158648257564235e-06, "loss": 0.2606, "step": 2415 }, { "epoch": 1.0886455677216138, "grad_norm": 0.12454839412933186, "learning_rate": 5.0962293922365495e-06, "loss": 0.256, "step": 2420 }, { "epoch": 1.0908954552272387, "grad_norm": 0.18809390149779476, "learning_rate": 5.076592473998141e-06, "loss": 0.2646, "step": 2425 }, { "epoch": 1.0931453427328635, "grad_norm": 0.1508834503375353, "learning_rate": 5.056954374018236e-06, "loss": 0.2764, "step": 2430 }, { "epoch": 1.0953952302384882, "grad_norm": 0.1491889266816844, "learning_rate": 5.037315395292111e-06, "loss": 0.2691, "step": 2435 }, { "epoch": 1.0976451177441129, "grad_norm": 0.15633034297704468, "learning_rate": 5.017675840828597e-06, "loss": 0.2657, "step": 2440 }, { "epoch": 1.0998950052497376, "grad_norm": 0.15782519717103635, "learning_rate": 4.998036013645409e-06, "loss": 0.2561, "step": 2445 }, { "epoch": 1.1021448927553623, "grad_norm": 0.19449808917352213, "learning_rate": 4.97839621676447e-06, "loss": 0.2571, "step": 2450 }, { "epoch": 1.104394780260987, "grad_norm": 0.16872055966750726, "learning_rate": 4.958756753207234e-06, "loss": 0.2459, "step": 2455 }, { "epoch": 1.1066446677666117, "grad_norm": 0.17373438335912267, "learning_rate": 4.939117925990013e-06, "loss": 0.2805, "step": 2460 }, { "epoch": 1.1088945552722365, "grad_norm": 0.15151992334964703, "learning_rate": 4.919480038119302e-06, "loss": 0.251, "step": 2465 }, { "epoch": 1.1111444427778612, "grad_norm": 0.15554454267536397, "learning_rate": 4.899843392587104e-06, "loss": 0.2533, "step": 2470 }, { "epoch": 1.113394330283486, "grad_norm": 0.16595294302301358, "learning_rate": 4.880208292366247e-06, "loss": 0.2864, "step": 2475 }, { "epoch": 1.1156442177891106, "grad_norm": 0.15038201249362013, "learning_rate": 4.860575040405726e-06, "loss": 0.2744, "step": 2480 }, { "epoch": 1.1178941052947353, "grad_norm": 0.16630054816008968, "learning_rate": 4.840943939626012e-06, "loss": 0.2362, "step": 2485 }, { "epoch": 1.12014399280036, "grad_norm": 0.18269629542973387, "learning_rate": 4.821315292914392e-06, "loss": 0.2786, "step": 2490 }, { "epoch": 1.1223938803059847, "grad_norm": 0.16417528180865418, "learning_rate": 4.801689403120282e-06, "loss": 0.2506, "step": 2495 }, { "epoch": 1.1246437678116095, "grad_norm": 0.12251195240813534, "learning_rate": 4.782066573050567e-06, "loss": 0.2693, "step": 2500 }, { "epoch": 1.1268936553172342, "grad_norm": 0.19913321021658195, "learning_rate": 4.7624471054649216e-06, "loss": 0.26, "step": 2505 }, { "epoch": 1.129143542822859, "grad_norm": 0.16359478594452095, "learning_rate": 4.742831303071143e-06, "loss": 0.2507, "step": 2510 }, { "epoch": 1.1313934303284836, "grad_norm": 0.20741074237045662, "learning_rate": 4.723219468520474e-06, "loss": 0.2678, "step": 2515 }, { "epoch": 1.1336433178341083, "grad_norm": 0.16956816625653676, "learning_rate": 4.703611904402939e-06, "loss": 0.2795, "step": 2520 }, { "epoch": 1.135893205339733, "grad_norm": 0.1818340434409631, "learning_rate": 4.684008913242679e-06, "loss": 0.2586, "step": 2525 }, { "epoch": 1.1381430928453578, "grad_norm": 0.17749209313732456, "learning_rate": 4.664410797493275e-06, "loss": 0.2708, "step": 2530 }, { "epoch": 1.1403929803509825, "grad_norm": 0.167827444506409, "learning_rate": 4.644817859533083e-06, "loss": 0.2717, "step": 2535 }, { "epoch": 1.1426428678566072, "grad_norm": 0.17149191797141825, "learning_rate": 4.625230401660578e-06, "loss": 0.2444, "step": 2540 }, { "epoch": 1.144892755362232, "grad_norm": 0.19053262323498327, "learning_rate": 4.605648726089674e-06, "loss": 0.2546, "step": 2545 }, { "epoch": 1.1471426428678566, "grad_norm": 0.17029611567515032, "learning_rate": 4.58607313494508e-06, "loss": 0.2515, "step": 2550 }, { "epoch": 1.1493925303734813, "grad_norm": 0.20535330778256622, "learning_rate": 4.566503930257624e-06, "loss": 0.2687, "step": 2555 }, { "epoch": 1.151642417879106, "grad_norm": 0.17888453950166083, "learning_rate": 4.546941413959595e-06, "loss": 0.2582, "step": 2560 }, { "epoch": 1.1538923053847308, "grad_norm": 0.17098481716726255, "learning_rate": 4.5273858878800895e-06, "loss": 0.2633, "step": 2565 }, { "epoch": 1.1561421928903555, "grad_norm": 0.22394541422414396, "learning_rate": 4.507837653740355e-06, "loss": 0.2657, "step": 2570 }, { "epoch": 1.1583920803959802, "grad_norm": 0.16148745686481833, "learning_rate": 4.4882970131491286e-06, "loss": 0.2469, "step": 2575 }, { "epoch": 1.160641967901605, "grad_norm": 0.21762812124764483, "learning_rate": 4.468764267597986e-06, "loss": 0.2815, "step": 2580 }, { "epoch": 1.1628918554072296, "grad_norm": 0.2041647572323139, "learning_rate": 4.449239718456696e-06, "loss": 0.253, "step": 2585 }, { "epoch": 1.1651417429128543, "grad_norm": 0.1508182234886033, "learning_rate": 4.429723666968559e-06, "loss": 0.2532, "step": 2590 }, { "epoch": 1.167391630418479, "grad_norm": 0.22173731592066487, "learning_rate": 4.410216414245771e-06, "loss": 0.2597, "step": 2595 }, { "epoch": 1.1696415179241038, "grad_norm": 0.15334607029538722, "learning_rate": 4.390718261264768e-06, "loss": 0.2429, "step": 2600 }, { "epoch": 1.1718914054297285, "grad_norm": 0.17386719805484463, "learning_rate": 4.371229508861588e-06, "loss": 0.2718, "step": 2605 }, { "epoch": 1.1741412929353532, "grad_norm": 0.255145373819277, "learning_rate": 4.351750457727229e-06, "loss": 0.2544, "step": 2610 }, { "epoch": 1.176391180440978, "grad_norm": 0.19091868423027997, "learning_rate": 4.332281408403011e-06, "loss": 0.26, "step": 2615 }, { "epoch": 1.1786410679466026, "grad_norm": 0.17031635023758315, "learning_rate": 4.312822661275929e-06, "loss": 0.2478, "step": 2620 }, { "epoch": 1.1808909554522273, "grad_norm": 0.18810141305157912, "learning_rate": 4.293374516574031e-06, "loss": 0.2593, "step": 2625 }, { "epoch": 1.183140842957852, "grad_norm": 0.20489249951929697, "learning_rate": 4.273937274361782e-06, "loss": 0.2226, "step": 2630 }, { "epoch": 1.1853907304634768, "grad_norm": 0.18589998495363094, "learning_rate": 4.254511234535432e-06, "loss": 0.2313, "step": 2635 }, { "epoch": 1.1876406179691015, "grad_norm": 0.1974695166475231, "learning_rate": 4.235096696818385e-06, "loss": 0.2782, "step": 2640 }, { "epoch": 1.1898905054747262, "grad_norm": 0.15560807641673985, "learning_rate": 4.215693960756586e-06, "loss": 0.2461, "step": 2645 }, { "epoch": 1.192140392980351, "grad_norm": 0.14168460680781833, "learning_rate": 4.1963033257138904e-06, "loss": 0.2323, "step": 2650 }, { "epoch": 1.1943902804859756, "grad_norm": 0.19193101382035213, "learning_rate": 4.176925090867449e-06, "loss": 0.252, "step": 2655 }, { "epoch": 1.1966401679916003, "grad_norm": 0.19059681316908272, "learning_rate": 4.157559555203086e-06, "loss": 0.2237, "step": 2660 }, { "epoch": 1.198890055497225, "grad_norm": 0.18365584045782385, "learning_rate": 4.138207017510696e-06, "loss": 0.2498, "step": 2665 }, { "epoch": 1.2011399430028498, "grad_norm": 0.17126185601849214, "learning_rate": 4.118867776379624e-06, "loss": 0.2121, "step": 2670 }, { "epoch": 1.2033898305084745, "grad_norm": 0.23530086737062514, "learning_rate": 4.099542130194069e-06, "loss": 0.2369, "step": 2675 }, { "epoch": 1.2056397180140994, "grad_norm": 0.1759441387313428, "learning_rate": 4.0802303771284685e-06, "loss": 0.2171, "step": 2680 }, { "epoch": 1.207889605519724, "grad_norm": 0.19878924933956027, "learning_rate": 4.060932815142904e-06, "loss": 0.2631, "step": 2685 }, { "epoch": 1.2101394930253488, "grad_norm": 0.1984620336427276, "learning_rate": 4.041649741978508e-06, "loss": 0.2408, "step": 2690 }, { "epoch": 1.2123893805309733, "grad_norm": 0.16406158411947314, "learning_rate": 4.022381455152863e-06, "loss": 0.2204, "step": 2695 }, { "epoch": 1.2146392680365983, "grad_norm": 0.18585134324802086, "learning_rate": 4.003128251955412e-06, "loss": 0.2254, "step": 2700 }, { "epoch": 1.2168891555422228, "grad_norm": 0.2028470417783533, "learning_rate": 3.983890429442876e-06, "loss": 0.2174, "step": 2705 }, { "epoch": 1.2191390430478477, "grad_norm": 0.18306124060212872, "learning_rate": 3.964668284434666e-06, "loss": 0.2281, "step": 2710 }, { "epoch": 1.2213889305534724, "grad_norm": 0.2261111639681813, "learning_rate": 3.945462113508312e-06, "loss": 0.2183, "step": 2715 }, { "epoch": 1.2236388180590971, "grad_norm": 0.21171892163095699, "learning_rate": 3.92627221299487e-06, "loss": 0.2249, "step": 2720 }, { "epoch": 1.2258887055647218, "grad_norm": 0.18554866614076224, "learning_rate": 3.907098878974367e-06, "loss": 0.2356, "step": 2725 }, { "epoch": 1.2281385930703466, "grad_norm": 0.17740452807380613, "learning_rate": 3.887942407271228e-06, "loss": 0.2213, "step": 2730 }, { "epoch": 1.2303884805759713, "grad_norm": 0.19628544094095077, "learning_rate": 3.868803093449709e-06, "loss": 0.2256, "step": 2735 }, { "epoch": 1.232638368081596, "grad_norm": 0.1906710395370276, "learning_rate": 3.8496812328093335e-06, "loss": 0.2431, "step": 2740 }, { "epoch": 1.2348882555872207, "grad_norm": 0.21739799246928065, "learning_rate": 3.8305771203803434e-06, "loss": 0.2053, "step": 2745 }, { "epoch": 1.2371381430928454, "grad_norm": 0.20933189544262915, "learning_rate": 3.8114910509191483e-06, "loss": 0.2372, "step": 2750 }, { "epoch": 1.2393880305984701, "grad_norm": 0.16805864711967494, "learning_rate": 3.7924233189037697e-06, "loss": 0.2421, "step": 2755 }, { "epoch": 1.2416379181040949, "grad_norm": 0.23407049517629622, "learning_rate": 3.773374218529298e-06, "loss": 0.2289, "step": 2760 }, { "epoch": 1.2438878056097196, "grad_norm": 0.20043628906146582, "learning_rate": 3.7543440437033656e-06, "loss": 0.2197, "step": 2765 }, { "epoch": 1.2461376931153443, "grad_norm": 0.1811301883423287, "learning_rate": 3.7353330880415963e-06, "loss": 0.2118, "step": 2770 }, { "epoch": 1.248387580620969, "grad_norm": 0.20413664615759625, "learning_rate": 3.7163416448630886e-06, "loss": 0.2103, "step": 2775 }, { "epoch": 1.2506374681265937, "grad_norm": 0.2053294418375065, "learning_rate": 3.6973700071858764e-06, "loss": 0.2265, "step": 2780 }, { "epoch": 1.2528873556322184, "grad_norm": 0.17855437216730508, "learning_rate": 3.6784184677224204e-06, "loss": 0.2082, "step": 2785 }, { "epoch": 1.2551372431378431, "grad_norm": 0.21204933584524724, "learning_rate": 3.659487318875087e-06, "loss": 0.2368, "step": 2790 }, { "epoch": 1.2573871306434679, "grad_norm": 0.244934854739885, "learning_rate": 3.6405768527316376e-06, "loss": 0.2236, "step": 2795 }, { "epoch": 1.2596370181490926, "grad_norm": 0.20352719384257717, "learning_rate": 3.6216873610607155e-06, "loss": 0.2127, "step": 2800 }, { "epoch": 1.2618869056547173, "grad_norm": 0.21525625357885447, "learning_rate": 3.602819135307355e-06, "loss": 0.2026, "step": 2805 }, { "epoch": 1.264136793160342, "grad_norm": 0.24886200931475094, "learning_rate": 3.58397246658848e-06, "loss": 0.2049, "step": 2810 }, { "epoch": 1.2663866806659667, "grad_norm": 0.22213048059657176, "learning_rate": 3.5651476456884103e-06, "loss": 0.2149, "step": 2815 }, { "epoch": 1.2686365681715914, "grad_norm": 0.24474792019196667, "learning_rate": 3.5463449630543744e-06, "loss": 0.2176, "step": 2820 }, { "epoch": 1.2708864556772161, "grad_norm": 0.21959268792414904, "learning_rate": 3.527564708792035e-06, "loss": 0.2319, "step": 2825 }, { "epoch": 1.2731363431828409, "grad_norm": 0.21285142665025264, "learning_rate": 3.508807172661006e-06, "loss": 0.2278, "step": 2830 }, { "epoch": 1.2753862306884656, "grad_norm": 0.24872484432655345, "learning_rate": 3.490072644070386e-06, "loss": 0.2367, "step": 2835 }, { "epoch": 1.2776361181940903, "grad_norm": 0.2446892197957464, "learning_rate": 3.47136141207429e-06, "loss": 0.2147, "step": 2840 }, { "epoch": 1.279886005699715, "grad_norm": 0.3593552477933211, "learning_rate": 3.452673765367389e-06, "loss": 0.2471, "step": 2845 }, { "epoch": 1.2821358932053397, "grad_norm": 0.18760658096432373, "learning_rate": 3.4340099922804627e-06, "loss": 0.2185, "step": 2850 }, { "epoch": 1.2843857807109644, "grad_norm": 0.1746094898464911, "learning_rate": 3.4153703807759432e-06, "loss": 0.1939, "step": 2855 }, { "epoch": 1.2866356682165891, "grad_norm": 0.2386232051443061, "learning_rate": 3.3967552184434753e-06, "loss": 0.2182, "step": 2860 }, { "epoch": 1.2888855557222139, "grad_norm": 0.2147456869413775, "learning_rate": 3.378164792495475e-06, "loss": 0.2232, "step": 2865 }, { "epoch": 1.2911354432278386, "grad_norm": 0.21939888824914258, "learning_rate": 3.3595993897627098e-06, "loss": 0.2059, "step": 2870 }, { "epoch": 1.2933853307334633, "grad_norm": 0.20007621997926173, "learning_rate": 3.3410592966898565e-06, "loss": 0.2025, "step": 2875 }, { "epoch": 1.295635218239088, "grad_norm": 0.22959303011889556, "learning_rate": 3.3225447993310983e-06, "loss": 0.2004, "step": 2880 }, { "epoch": 1.2978851057447127, "grad_norm": 0.23309801112874845, "learning_rate": 3.3040561833456964e-06, "loss": 0.1914, "step": 2885 }, { "epoch": 1.3001349932503374, "grad_norm": 0.22848735574436602, "learning_rate": 3.2855937339935933e-06, "loss": 0.1844, "step": 2890 }, { "epoch": 1.3023848807559621, "grad_norm": 0.20570875834144497, "learning_rate": 3.2671577361310087e-06, "loss": 0.2132, "step": 2895 }, { "epoch": 1.3046347682615869, "grad_norm": 0.24826968315533732, "learning_rate": 3.2487484742060427e-06, "loss": 0.2111, "step": 2900 }, { "epoch": 1.3068846557672116, "grad_norm": 0.2410762961266627, "learning_rate": 3.2303662322542835e-06, "loss": 0.1948, "step": 2905 }, { "epoch": 1.3091345432728363, "grad_norm": 0.23347593077480983, "learning_rate": 3.212011293894436e-06, "loss": 0.2008, "step": 2910 }, { "epoch": 1.311384430778461, "grad_norm": 0.21360507603920142, "learning_rate": 3.1936839423239376e-06, "loss": 0.2042, "step": 2915 }, { "epoch": 1.3136343182840857, "grad_norm": 0.21960761516089436, "learning_rate": 3.1753844603145894e-06, "loss": 0.2391, "step": 2920 }, { "epoch": 1.3158842057897104, "grad_norm": 0.20203229318870164, "learning_rate": 3.1571131302081916e-06, "loss": 0.1876, "step": 2925 }, { "epoch": 1.3181340932953352, "grad_norm": 0.24191918555495237, "learning_rate": 3.138870233912197e-06, "loss": 0.1962, "step": 2930 }, { "epoch": 1.32038398080096, "grad_norm": 0.20921020378628946, "learning_rate": 3.1206560528953467e-06, "loss": 0.2058, "step": 2935 }, { "epoch": 1.3226338683065846, "grad_norm": 0.20869954401470014, "learning_rate": 3.102470868183344e-06, "loss": 0.2064, "step": 2940 }, { "epoch": 1.3248837558122095, "grad_norm": 0.2249649340119077, "learning_rate": 3.084314960354501e-06, "loss": 0.2046, "step": 2945 }, { "epoch": 1.327133643317834, "grad_norm": 0.25182875069609073, "learning_rate": 3.066188609535421e-06, "loss": 0.2037, "step": 2950 }, { "epoch": 1.329383530823459, "grad_norm": 0.21107658603026272, "learning_rate": 3.0480920953966786e-06, "loss": 0.2094, "step": 2955 }, { "epoch": 1.3316334183290834, "grad_norm": 0.22913188913268076, "learning_rate": 3.0300256971484943e-06, "loss": 0.2162, "step": 2960 }, { "epoch": 1.3338833058347084, "grad_norm": 0.2539850632246194, "learning_rate": 3.0119896935364305e-06, "loss": 0.1941, "step": 2965 }, { "epoch": 1.3361331933403329, "grad_norm": 0.23858971814994895, "learning_rate": 2.993984362837098e-06, "loss": 0.1839, "step": 2970 }, { "epoch": 1.3383830808459578, "grad_norm": 0.2721885562055672, "learning_rate": 2.9760099828538545e-06, "loss": 0.211, "step": 2975 }, { "epoch": 1.3406329683515823, "grad_norm": 0.2482373958173057, "learning_rate": 2.9580668309125203e-06, "loss": 0.1998, "step": 2980 }, { "epoch": 1.3428828558572072, "grad_norm": 0.2306376622557913, "learning_rate": 2.940155183857096e-06, "loss": 0.2196, "step": 2985 }, { "epoch": 1.3451327433628317, "grad_norm": 0.26262612593731016, "learning_rate": 2.922275318045502e-06, "loss": 0.1882, "step": 2990 }, { "epoch": 1.3473826308684567, "grad_norm": 0.20346419756531464, "learning_rate": 2.9044275093453034e-06, "loss": 0.193, "step": 2995 }, { "epoch": 1.3496325183740812, "grad_norm": 0.22993902000452152, "learning_rate": 2.8866120331294567e-06, "loss": 0.1736, "step": 3000 }, { "epoch": 1.351882405879706, "grad_norm": 0.27935117403868454, "learning_rate": 2.8688291642720656e-06, "loss": 0.1904, "step": 3005 }, { "epoch": 1.3541322933853308, "grad_norm": 0.27365571988160076, "learning_rate": 2.8510791771441327e-06, "loss": 0.1853, "step": 3010 }, { "epoch": 1.3563821808909555, "grad_norm": 0.23083646098925237, "learning_rate": 2.8333623456093313e-06, "loss": 0.1968, "step": 3015 }, { "epoch": 1.3586320683965802, "grad_norm": 0.2704413392632432, "learning_rate": 2.815678943019784e-06, "loss": 0.1927, "step": 3020 }, { "epoch": 1.360881955902205, "grad_norm": 0.24524940743516008, "learning_rate": 2.7980292422118282e-06, "loss": 0.2051, "step": 3025 }, { "epoch": 1.3631318434078297, "grad_norm": 0.24409905829083706, "learning_rate": 2.7804135155018307e-06, "loss": 0.164, "step": 3030 }, { "epoch": 1.3653817309134544, "grad_norm": 0.23814624701122666, "learning_rate": 2.762832034681965e-06, "loss": 0.1777, "step": 3035 }, { "epoch": 1.367631618419079, "grad_norm": 0.24368144586742516, "learning_rate": 2.7452850710160305e-06, "loss": 0.1946, "step": 3040 }, { "epoch": 1.3698815059247038, "grad_norm": 0.24657044578855591, "learning_rate": 2.727772895235262e-06, "loss": 0.2024, "step": 3045 }, { "epoch": 1.3721313934303285, "grad_norm": 0.20668089980394588, "learning_rate": 2.710295777534154e-06, "loss": 0.1853, "step": 3050 }, { "epoch": 1.3743812809359532, "grad_norm": 0.22601090907078772, "learning_rate": 2.692853987566291e-06, "loss": 0.1764, "step": 3055 }, { "epoch": 1.376631168441578, "grad_norm": 0.2872809956397954, "learning_rate": 2.675447794440188e-06, "loss": 0.1609, "step": 3060 }, { "epoch": 1.3788810559472027, "grad_norm": 0.2411498866283728, "learning_rate": 2.658077466715138e-06, "loss": 0.1813, "step": 3065 }, { "epoch": 1.3811309434528274, "grad_norm": 0.19284001830686515, "learning_rate": 2.6407432723970694e-06, "loss": 0.1751, "step": 3070 }, { "epoch": 1.383380830958452, "grad_norm": 0.23962546665483148, "learning_rate": 2.6234454789344067e-06, "loss": 0.164, "step": 3075 }, { "epoch": 1.3856307184640768, "grad_norm": 0.25424115274026465, "learning_rate": 2.6061843532139563e-06, "loss": 0.1816, "step": 3080 }, { "epoch": 1.3878806059697015, "grad_norm": 0.25313051025088457, "learning_rate": 2.5889601615567657e-06, "loss": 0.1813, "step": 3085 }, { "epoch": 1.3901304934753262, "grad_norm": 0.2378148779779353, "learning_rate": 2.5717731697140425e-06, "loss": 0.1822, "step": 3090 }, { "epoch": 1.392380380980951, "grad_norm": 0.24347808422412195, "learning_rate": 2.554623642863031e-06, "loss": 0.165, "step": 3095 }, { "epoch": 1.3946302684865757, "grad_norm": 0.26898645682575706, "learning_rate": 2.5375118456029345e-06, "loss": 0.1834, "step": 3100 }, { "epoch": 1.3968801559922004, "grad_norm": 0.20720842966338204, "learning_rate": 2.520438041950827e-06, "loss": 0.1638, "step": 3105 }, { "epoch": 1.399130043497825, "grad_norm": 0.17953407898094143, "learning_rate": 2.503402495337579e-06, "loss": 0.1547, "step": 3110 }, { "epoch": 1.4013799310034498, "grad_norm": 0.27527731902583097, "learning_rate": 2.4864054686037993e-06, "loss": 0.1771, "step": 3115 }, { "epoch": 1.4036298185090745, "grad_norm": 0.2441103318991431, "learning_rate": 2.469447223995772e-06, "loss": 0.1866, "step": 3120 }, { "epoch": 1.4058797060146992, "grad_norm": 0.22146555295977546, "learning_rate": 2.452528023161414e-06, "loss": 0.1658, "step": 3125 }, { "epoch": 1.408129593520324, "grad_norm": 0.24036023292266545, "learning_rate": 2.4356481271462396e-06, "loss": 0.1951, "step": 3130 }, { "epoch": 1.4103794810259487, "grad_norm": 0.2346343428333368, "learning_rate": 2.4188077963893276e-06, "loss": 0.1724, "step": 3135 }, { "epoch": 1.4126293685315734, "grad_norm": 0.28341682436165366, "learning_rate": 2.4020072907193123e-06, "loss": 0.1786, "step": 3140 }, { "epoch": 1.414879256037198, "grad_norm": 0.25526003142103276, "learning_rate": 2.3852468693503635e-06, "loss": 0.166, "step": 3145 }, { "epoch": 1.4171291435428228, "grad_norm": 0.21169312639212728, "learning_rate": 2.3685267908781934e-06, "loss": 0.1651, "step": 3150 }, { "epoch": 1.4193790310484475, "grad_norm": 0.2311680518850515, "learning_rate": 2.3518473132760668e-06, "loss": 0.1943, "step": 3155 }, { "epoch": 1.4216289185540723, "grad_norm": 0.2466100225354571, "learning_rate": 2.335208693890819e-06, "loss": 0.1759, "step": 3160 }, { "epoch": 1.423878806059697, "grad_norm": 0.25320147361029777, "learning_rate": 2.318611189438884e-06, "loss": 0.1741, "step": 3165 }, { "epoch": 1.4261286935653217, "grad_norm": 0.20379034599277449, "learning_rate": 2.30205505600234e-06, "loss": 0.1515, "step": 3170 }, { "epoch": 1.4283785810709464, "grad_norm": 0.2171396909921854, "learning_rate": 2.2855405490249498e-06, "loss": 0.17, "step": 3175 }, { "epoch": 1.430628468576571, "grad_norm": 0.25025327769749556, "learning_rate": 2.2690679233082237e-06, "loss": 0.1442, "step": 3180 }, { "epoch": 1.4328783560821958, "grad_norm": 0.20134411154173665, "learning_rate": 2.2526374330074945e-06, "loss": 0.172, "step": 3185 }, { "epoch": 1.4351282435878205, "grad_norm": 0.20636498909519851, "learning_rate": 2.23624933162798e-06, "loss": 0.1663, "step": 3190 }, { "epoch": 1.4373781310934453, "grad_norm": 0.24524283669244562, "learning_rate": 2.219903872020885e-06, "loss": 0.1726, "step": 3195 }, { "epoch": 1.43962801859907, "grad_norm": 0.226380159375995, "learning_rate": 2.2036013063795024e-06, "loss": 0.1707, "step": 3200 }, { "epoch": 1.4418779061046947, "grad_norm": 0.23433271109204132, "learning_rate": 2.1873418862353095e-06, "loss": 0.1885, "step": 3205 }, { "epoch": 1.4441277936103196, "grad_norm": 0.27219913617478064, "learning_rate": 2.1711258624540955e-06, "loss": 0.1627, "step": 3210 }, { "epoch": 1.4463776811159441, "grad_norm": 0.2720555642584958, "learning_rate": 2.15495348523209e-06, "loss": 0.1637, "step": 3215 }, { "epoch": 1.448627568621569, "grad_norm": 0.26833619371988116, "learning_rate": 2.1388250040921007e-06, "loss": 0.1536, "step": 3220 }, { "epoch": 1.4508774561271935, "grad_norm": 0.2603614911120465, "learning_rate": 2.1227406678796664e-06, "loss": 0.1608, "step": 3225 }, { "epoch": 1.4531273436328185, "grad_norm": 0.21162975240993986, "learning_rate": 2.1067007247592153e-06, "loss": 0.1649, "step": 3230 }, { "epoch": 1.455377231138443, "grad_norm": 0.240416434380939, "learning_rate": 2.0907054222102367e-06, "loss": 0.157, "step": 3235 }, { "epoch": 1.457627118644068, "grad_norm": 0.22619962147371023, "learning_rate": 2.074755007023461e-06, "loss": 0.152, "step": 3240 }, { "epoch": 1.4598770061496924, "grad_norm": 0.23122000932974787, "learning_rate": 2.058849725297061e-06, "loss": 0.1674, "step": 3245 }, { "epoch": 1.4621268936553173, "grad_norm": 0.20088454332338435, "learning_rate": 2.042989822432837e-06, "loss": 0.1426, "step": 3250 }, { "epoch": 1.4643767811609418, "grad_norm": 0.22446161891702227, "learning_rate": 2.0271755431324456e-06, "loss": 0.1431, "step": 3255 }, { "epoch": 1.4666266686665668, "grad_norm": 0.25951357745647785, "learning_rate": 2.011407131393624e-06, "loss": 0.1521, "step": 3260 }, { "epoch": 1.4688765561721913, "grad_norm": 0.23931332134441274, "learning_rate": 1.9956848305064156e-06, "loss": 0.1348, "step": 3265 }, { "epoch": 1.4711264436778162, "grad_norm": 0.2443591943685552, "learning_rate": 1.9800088830494233e-06, "loss": 0.1616, "step": 3270 }, { "epoch": 1.4733763311834407, "grad_norm": 0.2583750411808441, "learning_rate": 1.964379530886066e-06, "loss": 0.1568, "step": 3275 }, { "epoch": 1.4756262186890656, "grad_norm": 0.23149031179647173, "learning_rate": 1.948797015160845e-06, "loss": 0.1628, "step": 3280 }, { "epoch": 1.4778761061946903, "grad_norm": 0.24603772896490778, "learning_rate": 1.9332615762956252e-06, "loss": 0.172, "step": 3285 }, { "epoch": 1.480125993700315, "grad_norm": 0.21815434321755453, "learning_rate": 1.9177734539859246e-06, "loss": 0.1412, "step": 3290 }, { "epoch": 1.4823758812059398, "grad_norm": 0.22721807904843327, "learning_rate": 1.9023328871972163e-06, "loss": 0.1535, "step": 3295 }, { "epoch": 1.4846257687115645, "grad_norm": 0.242196598571302, "learning_rate": 1.886940114161241e-06, "loss": 0.1418, "step": 3300 }, { "epoch": 1.4868756562171892, "grad_norm": 0.31031701723743615, "learning_rate": 1.8715953723723374e-06, "loss": 0.152, "step": 3305 }, { "epoch": 1.489125543722814, "grad_norm": 0.2768031393228084, "learning_rate": 1.8562988985837632e-06, "loss": 0.1333, "step": 3310 }, { "epoch": 1.4913754312284386, "grad_norm": 0.20415150169731586, "learning_rate": 1.8410509288040557e-06, "loss": 0.1414, "step": 3315 }, { "epoch": 1.4936253187340633, "grad_norm": 0.26667619653525043, "learning_rate": 1.8258516982933905e-06, "loss": 0.1468, "step": 3320 }, { "epoch": 1.495875206239688, "grad_norm": 0.2944474959496459, "learning_rate": 1.8107014415599416e-06, "loss": 0.1191, "step": 3325 }, { "epoch": 1.4981250937453128, "grad_norm": 0.2393057029593332, "learning_rate": 1.7956003923562715e-06, "loss": 0.1404, "step": 3330 }, { "epoch": 1.5003749812509375, "grad_norm": 0.2714540493615385, "learning_rate": 1.7805487836757224e-06, "loss": 0.139, "step": 3335 }, { "epoch": 1.5026248687565622, "grad_norm": 0.24328575957899592, "learning_rate": 1.7655468477488191e-06, "loss": 0.1388, "step": 3340 }, { "epoch": 1.504874756262187, "grad_norm": 0.22919700731255085, "learning_rate": 1.7505948160396901e-06, "loss": 0.1238, "step": 3345 }, { "epoch": 1.5071246437678116, "grad_norm": 0.2619372461489788, "learning_rate": 1.7356929192424937e-06, "loss": 0.1451, "step": 3350 }, { "epoch": 1.5093745312734363, "grad_norm": 0.2086634801533094, "learning_rate": 1.720841387277858e-06, "loss": 0.1485, "step": 3355 }, { "epoch": 1.511624418779061, "grad_norm": 0.22980427576347195, "learning_rate": 1.7060404492893345e-06, "loss": 0.1474, "step": 3360 }, { "epoch": 1.5138743062846858, "grad_norm": 0.24590985962229212, "learning_rate": 1.6912903336398677e-06, "loss": 0.1375, "step": 3365 }, { "epoch": 1.5161241937903105, "grad_norm": 0.25732491354780235, "learning_rate": 1.6765912679082592e-06, "loss": 0.1357, "step": 3370 }, { "epoch": 1.5183740812959352, "grad_norm": 0.26500627710086616, "learning_rate": 1.6619434788856664e-06, "loss": 0.1419, "step": 3375 }, { "epoch": 1.52062396880156, "grad_norm": 0.2928540052735079, "learning_rate": 1.647347192572105e-06, "loss": 0.1307, "step": 3380 }, { "epoch": 1.5228738563071846, "grad_norm": 0.21671093221137389, "learning_rate": 1.6328026341729547e-06, "loss": 0.1269, "step": 3385 }, { "epoch": 1.5251237438128094, "grad_norm": 0.20008846223184612, "learning_rate": 1.618310028095486e-06, "loss": 0.1229, "step": 3390 }, { "epoch": 1.527373631318434, "grad_norm": 0.27258283595012933, "learning_rate": 1.6038695979454033e-06, "loss": 0.1291, "step": 3395 }, { "epoch": 1.5296235188240588, "grad_norm": 0.1659681540695972, "learning_rate": 1.589481566523388e-06, "loss": 0.1132, "step": 3400 }, { "epoch": 1.5318734063296835, "grad_norm": 0.29801173110625, "learning_rate": 1.5751461558216662e-06, "loss": 0.1557, "step": 3405 }, { "epoch": 1.5341232938353082, "grad_norm": 0.21585855046664817, "learning_rate": 1.5608635870205813e-06, "loss": 0.1275, "step": 3410 }, { "epoch": 1.536373181340933, "grad_norm": 0.22691724613303907, "learning_rate": 1.546634080485181e-06, "loss": 0.1263, "step": 3415 }, { "epoch": 1.5386230688465576, "grad_norm": 0.3126367672650283, "learning_rate": 1.5324578557618158e-06, "loss": 0.1281, "step": 3420 }, { "epoch": 1.5408729563521824, "grad_norm": 0.2605966357941338, "learning_rate": 1.5183351315747618e-06, "loss": 0.1334, "step": 3425 }, { "epoch": 1.543122843857807, "grad_norm": 0.21630173206179193, "learning_rate": 1.5042661258228268e-06, "loss": 0.1372, "step": 3430 }, { "epoch": 1.5453727313634318, "grad_norm": 0.26829878682557234, "learning_rate": 1.4902510555760052e-06, "loss": 0.1257, "step": 3435 }, { "epoch": 1.5476226188690565, "grad_norm": 0.29544397855593446, "learning_rate": 1.4762901370721266e-06, "loss": 0.1396, "step": 3440 }, { "epoch": 1.5498725063746812, "grad_norm": 0.24819039772864243, "learning_rate": 1.4623835857135099e-06, "loss": 0.1298, "step": 3445 }, { "epoch": 1.552122393880306, "grad_norm": 0.20631029388339692, "learning_rate": 1.4485316160636491e-06, "loss": 0.1135, "step": 3450 }, { "epoch": 1.5543722813859309, "grad_norm": 0.2588845660409855, "learning_rate": 1.434734441843899e-06, "loss": 0.119, "step": 3455 }, { "epoch": 1.5566221688915554, "grad_norm": 0.27482300851220287, "learning_rate": 1.420992275930178e-06, "loss": 0.1228, "step": 3460 }, { "epoch": 1.5588720563971803, "grad_norm": 0.23756828701782703, "learning_rate": 1.4073053303496837e-06, "loss": 0.1397, "step": 3465 }, { "epoch": 1.5611219439028048, "grad_norm": 0.2565129570324179, "learning_rate": 1.3936738162776269e-06, "loss": 0.1171, "step": 3470 }, { "epoch": 1.5633718314084297, "grad_norm": 0.23747517958547196, "learning_rate": 1.3800979440339602e-06, "loss": 0.1112, "step": 3475 }, { "epoch": 1.5656217189140542, "grad_norm": 0.28612053881235616, "learning_rate": 1.3665779230801452e-06, "loss": 0.1214, "step": 3480 }, { "epoch": 1.5678716064196792, "grad_norm": 0.21974317229860285, "learning_rate": 1.353113962015919e-06, "loss": 0.1133, "step": 3485 }, { "epoch": 1.5701214939253036, "grad_norm": 0.2586485593990932, "learning_rate": 1.3397062685760715e-06, "loss": 0.131, "step": 3490 }, { "epoch": 1.5723713814309286, "grad_norm": 0.24428264502478964, "learning_rate": 1.326355049627238e-06, "loss": 0.1239, "step": 3495 }, { "epoch": 1.574621268936553, "grad_norm": 0.2440528536513943, "learning_rate": 1.31306051116472e-06, "loss": 0.1156, "step": 3500 }, { "epoch": 1.576871156442178, "grad_norm": 0.25949845916894754, "learning_rate": 1.299822858309292e-06, "loss": 0.118, "step": 3505 }, { "epoch": 1.5791210439478025, "grad_norm": 0.27496698683134035, "learning_rate": 1.2866422953040458e-06, "loss": 0.1364, "step": 3510 }, { "epoch": 1.5813709314534274, "grad_norm": 0.22644679203329376, "learning_rate": 1.273519025511236e-06, "loss": 0.1242, "step": 3515 }, { "epoch": 1.583620818959052, "grad_norm": 0.27460451136628766, "learning_rate": 1.2604532514091444e-06, "loss": 0.1179, "step": 3520 }, { "epoch": 1.5858707064646769, "grad_norm": 0.20903077719719648, "learning_rate": 1.2474451745889516e-06, "loss": 0.1174, "step": 3525 }, { "epoch": 1.5881205939703014, "grad_norm": 0.24497256736111866, "learning_rate": 1.2344949957516356e-06, "loss": 0.1119, "step": 3530 }, { "epoch": 1.5903704814759263, "grad_norm": 0.2539761952183662, "learning_rate": 1.221602914704862e-06, "loss": 0.1219, "step": 3535 }, { "epoch": 1.5926203689815508, "grad_norm": 0.23677806854402075, "learning_rate": 1.2087691303599109e-06, "loss": 0.1131, "step": 3540 }, { "epoch": 1.5948702564871757, "grad_norm": 0.28943703222233913, "learning_rate": 1.1959938407286099e-06, "loss": 0.1265, "step": 3545 }, { "epoch": 1.5971201439928002, "grad_norm": 0.26936033145050353, "learning_rate": 1.1832772429202716e-06, "loss": 0.1155, "step": 3550 }, { "epoch": 1.5993700314984252, "grad_norm": 0.24859595390547068, "learning_rate": 1.1706195331386494e-06, "loss": 0.1319, "step": 3555 }, { "epoch": 1.6016199190040497, "grad_norm": 0.21137786230401104, "learning_rate": 1.1580209066789272e-06, "loss": 0.0959, "step": 3560 }, { "epoch": 1.6038698065096746, "grad_norm": 0.18826307259382147, "learning_rate": 1.1454815579246874e-06, "loss": 0.1162, "step": 3565 }, { "epoch": 1.606119694015299, "grad_norm": 0.2059875775498964, "learning_rate": 1.1330016803449224e-06, "loss": 0.1079, "step": 3570 }, { "epoch": 1.608369581520924, "grad_norm": 0.28679664201908944, "learning_rate": 1.1205814664910464e-06, "loss": 0.1323, "step": 3575 }, { "epoch": 1.6106194690265485, "grad_norm": 0.23898491505271052, "learning_rate": 1.1082211079939248e-06, "loss": 0.1, "step": 3580 }, { "epoch": 1.6128693565321734, "grad_norm": 0.2700410969754371, "learning_rate": 1.0959207955609163e-06, "loss": 0.107, "step": 3585 }, { "epoch": 1.6151192440377982, "grad_norm": 0.21154102841364958, "learning_rate": 1.083680718972938e-06, "loss": 0.1126, "step": 3590 }, { "epoch": 1.6173691315434229, "grad_norm": 0.19582804111079785, "learning_rate": 1.0715010670815212e-06, "loss": 0.1111, "step": 3595 }, { "epoch": 1.6196190190490476, "grad_norm": 0.2154658262674778, "learning_rate": 1.059382027805914e-06, "loss": 0.1025, "step": 3600 }, { "epoch": 1.6218689065546723, "grad_norm": 0.30677049526532074, "learning_rate": 1.0473237881301763e-06, "loss": 0.1201, "step": 3605 }, { "epoch": 1.624118794060297, "grad_norm": 0.23614701148998188, "learning_rate": 1.0353265341002916e-06, "loss": 0.104, "step": 3610 }, { "epoch": 1.6263686815659217, "grad_norm": 0.24690793627028748, "learning_rate": 1.0233904508212955e-06, "loss": 0.1078, "step": 3615 }, { "epoch": 1.6286185690715465, "grad_norm": 0.31258122069910355, "learning_rate": 1.0115157224544313e-06, "loss": 0.1036, "step": 3620 }, { "epoch": 1.6308684565771712, "grad_norm": 0.23164502515145138, "learning_rate": 9.997025322142934e-07, "loss": 0.1082, "step": 3625 }, { "epoch": 1.6331183440827959, "grad_norm": 0.24106743301610264, "learning_rate": 9.87951062366011e-07, "loss": 0.1197, "step": 3630 }, { "epoch": 1.6353682315884206, "grad_norm": 0.26573732918101894, "learning_rate": 9.762614942224312e-07, "loss": 0.1205, "step": 3635 }, { "epoch": 1.6376181190940453, "grad_norm": 0.11482579343049812, "learning_rate": 9.646340081413225e-07, "loss": 0.0915, "step": 3640 }, { "epoch": 1.63986800659967, "grad_norm": 0.21075003083613178, "learning_rate": 9.530687835225916e-07, "loss": 0.097, "step": 3645 }, { "epoch": 1.6421178941052947, "grad_norm": 0.23753636897967206, "learning_rate": 9.415659988055215e-07, "loss": 0.1042, "step": 3650 }, { "epoch": 1.6443677816109195, "grad_norm": 0.24065236956197258, "learning_rate": 9.30125831466005e-07, "loss": 0.1021, "step": 3655 }, { "epoch": 1.6466176691165442, "grad_norm": 0.28653116996485667, "learning_rate": 9.187484580138184e-07, "loss": 0.1153, "step": 3660 }, { "epoch": 1.6488675566221689, "grad_norm": 0.2563141158700858, "learning_rate": 9.074340539898962e-07, "loss": 0.1106, "step": 3665 }, { "epoch": 1.6511174441277936, "grad_norm": 0.29385405444476315, "learning_rate": 8.961827939636198e-07, "loss": 0.1087, "step": 3670 }, { "epoch": 1.6533673316334183, "grad_norm": 0.2631541250069433, "learning_rate": 8.849948515301188e-07, "loss": 0.0978, "step": 3675 }, { "epoch": 1.655617219139043, "grad_norm": 0.2722585679724958, "learning_rate": 8.738703993076087e-07, "loss": 0.109, "step": 3680 }, { "epoch": 1.6578671066446677, "grad_norm": 0.3065358825170482, "learning_rate": 8.62809608934711e-07, "loss": 0.1019, "step": 3685 }, { "epoch": 1.6601169941502925, "grad_norm": 0.23763230365231583, "learning_rate": 8.518126510678138e-07, "loss": 0.1138, "step": 3690 }, { "epoch": 1.6623668816559172, "grad_norm": 0.2532402386408982, "learning_rate": 8.408796953784365e-07, "loss": 0.1102, "step": 3695 }, { "epoch": 1.6646167691615419, "grad_norm": 0.23005454958970656, "learning_rate": 8.30010910550611e-07, "loss": 0.1017, "step": 3700 }, { "epoch": 1.6668666566671666, "grad_norm": 0.2194832160899072, "learning_rate": 8.19206464278281e-07, "loss": 0.0985, "step": 3705 }, { "epoch": 1.6691165441727913, "grad_norm": 0.32261159442961446, "learning_rate": 8.084665232627165e-07, "loss": 0.1115, "step": 3710 }, { "epoch": 1.671366431678416, "grad_norm": 0.28020915769071963, "learning_rate": 7.977912532099336e-07, "loss": 0.1072, "step": 3715 }, { "epoch": 1.6736163191840407, "grad_norm": 0.2587579519713862, "learning_rate": 7.871808188281461e-07, "loss": 0.0884, "step": 3720 }, { "epoch": 1.6758662066896655, "grad_norm": 0.25937560152984207, "learning_rate": 7.766353838252227e-07, "loss": 0.0963, "step": 3725 }, { "epoch": 1.6781160941952904, "grad_norm": 0.2623209006276337, "learning_rate": 7.661551109061593e-07, "loss": 0.0945, "step": 3730 }, { "epoch": 1.680365981700915, "grad_norm": 0.2734268883455671, "learning_rate": 7.557401617705673e-07, "loss": 0.0962, "step": 3735 }, { "epoch": 1.6826158692065398, "grad_norm": 0.2600567478099387, "learning_rate": 7.453906971101826e-07, "loss": 0.0965, "step": 3740 }, { "epoch": 1.6848657567121643, "grad_norm": 0.21549655386834185, "learning_rate": 7.35106876606384e-07, "loss": 0.0802, "step": 3745 }, { "epoch": 1.6871156442177893, "grad_norm": 0.23243787528580465, "learning_rate": 7.248888589277275e-07, "loss": 0.0979, "step": 3750 }, { "epoch": 1.6893655317234137, "grad_norm": 0.2724000087724297, "learning_rate": 7.147368017275075e-07, "loss": 0.0954, "step": 3755 }, { "epoch": 1.6916154192290387, "grad_norm": 0.27067809227580786, "learning_rate": 7.046508616413078e-07, "loss": 0.0921, "step": 3760 }, { "epoch": 1.6938653067346632, "grad_norm": 0.2211597100684428, "learning_rate": 6.946311942846002e-07, "loss": 0.1051, "step": 3765 }, { "epoch": 1.6961151942402881, "grad_norm": 0.24002929405082607, "learning_rate": 6.846779542503384e-07, "loss": 0.0899, "step": 3770 }, { "epoch": 1.6983650817459126, "grad_norm": 0.19994511733272957, "learning_rate": 6.747912951065722e-07, "loss": 0.0914, "step": 3775 }, { "epoch": 1.7006149692515375, "grad_norm": 0.3127468963077912, "learning_rate": 6.649713693940718e-07, "loss": 0.1032, "step": 3780 }, { "epoch": 1.702864856757162, "grad_norm": 0.23642047104133684, "learning_rate": 6.552183286239899e-07, "loss": 0.087, "step": 3785 }, { "epoch": 1.705114744262787, "grad_norm": 0.2068805673647048, "learning_rate": 6.455323232755095e-07, "loss": 0.093, "step": 3790 }, { "epoch": 1.7073646317684115, "grad_norm": 0.26466757083784725, "learning_rate": 6.35913502793527e-07, "loss": 0.0857, "step": 3795 }, { "epoch": 1.7096145192740364, "grad_norm": 0.20726909159845547, "learning_rate": 6.263620155863492e-07, "loss": 0.0863, "step": 3800 }, { "epoch": 1.711864406779661, "grad_norm": 0.21055369695393691, "learning_rate": 6.168780090233994e-07, "loss": 0.0916, "step": 3805 }, { "epoch": 1.7141142942852858, "grad_norm": 0.2778245613335268, "learning_rate": 6.07461629432945e-07, "loss": 0.0917, "step": 3810 }, { "epoch": 1.7163641817909103, "grad_norm": 0.1999858453456665, "learning_rate": 5.981130220998444e-07, "loss": 0.0746, "step": 3815 }, { "epoch": 1.7186140692965353, "grad_norm": 0.19402830775044652, "learning_rate": 5.888323312632948e-07, "loss": 0.094, "step": 3820 }, { "epoch": 1.7208639568021598, "grad_norm": 0.26795633448623635, "learning_rate": 5.796197001146164e-07, "loss": 0.0884, "step": 3825 }, { "epoch": 1.7231138443077847, "grad_norm": 0.2390770570872304, "learning_rate": 5.704752707950412e-07, "loss": 0.0905, "step": 3830 }, { "epoch": 1.7253637318134092, "grad_norm": 0.16942131967267335, "learning_rate": 5.613991843935179e-07, "loss": 0.0827, "step": 3835 }, { "epoch": 1.7276136193190341, "grad_norm": 0.2027819269347922, "learning_rate": 5.523915809445313e-07, "loss": 0.0832, "step": 3840 }, { "epoch": 1.7298635068246586, "grad_norm": 0.22046848444535852, "learning_rate": 5.434525994259531e-07, "loss": 0.0886, "step": 3845 }, { "epoch": 1.7321133943302836, "grad_norm": 0.2595049287219421, "learning_rate": 5.345823777568859e-07, "loss": 0.0937, "step": 3850 }, { "epoch": 1.734363281835908, "grad_norm": 0.24929926941084485, "learning_rate": 5.25781052795541e-07, "loss": 0.0787, "step": 3855 }, { "epoch": 1.736613169341533, "grad_norm": 0.24888880504119226, "learning_rate": 5.170487603371266e-07, "loss": 0.0845, "step": 3860 }, { "epoch": 1.7388630568471577, "grad_norm": 0.2604651193142029, "learning_rate": 5.083856351117511e-07, "loss": 0.0786, "step": 3865 }, { "epoch": 1.7411129443527824, "grad_norm": 0.20003829357925593, "learning_rate": 4.997918107823446e-07, "loss": 0.08, "step": 3870 }, { "epoch": 1.7433628318584071, "grad_norm": 0.2051972235262297, "learning_rate": 4.912674199425999e-07, "loss": 0.0853, "step": 3875 }, { "epoch": 1.7456127193640318, "grad_norm": 0.2549934939375718, "learning_rate": 4.828125941149197e-07, "loss": 0.0844, "step": 3880 }, { "epoch": 1.7478626068696566, "grad_norm": 0.3024383705811877, "learning_rate": 4.7442746374839363e-07, "loss": 0.0846, "step": 3885 }, { "epoch": 1.7501124943752813, "grad_norm": 0.21907085756014216, "learning_rate": 4.6611215821678546e-07, "loss": 0.0839, "step": 3890 }, { "epoch": 1.752362381880906, "grad_norm": 0.2241634174428953, "learning_rate": 4.578668058165325e-07, "loss": 0.0758, "step": 3895 }, { "epoch": 1.7546122693865307, "grad_norm": 0.26223122192387566, "learning_rate": 4.4969153376476726e-07, "loss": 0.0814, "step": 3900 }, { "epoch": 1.7568621568921554, "grad_norm": 0.3009548280743066, "learning_rate": 4.415864681973608e-07, "loss": 0.079, "step": 3905 }, { "epoch": 1.7591120443977801, "grad_norm": 0.2282784959309563, "learning_rate": 4.335517341669676e-07, "loss": 0.084, "step": 3910 }, { "epoch": 1.7613619319034048, "grad_norm": 0.2736169743202772, "learning_rate": 4.255874556411016e-07, "loss": 0.0845, "step": 3915 }, { "epoch": 1.7636118194090296, "grad_norm": 0.2273826431677496, "learning_rate": 4.176937555002231e-07, "loss": 0.0789, "step": 3920 }, { "epoch": 1.7658617069146543, "grad_norm": 0.2562359538975016, "learning_rate": 4.098707555358411e-07, "loss": 0.0841, "step": 3925 }, { "epoch": 1.768111594420279, "grad_norm": 0.20875979878240594, "learning_rate": 4.0211857644863404e-07, "loss": 0.0868, "step": 3930 }, { "epoch": 1.7703614819259037, "grad_norm": 0.23466550669048516, "learning_rate": 3.9443733784659324e-07, "loss": 0.0863, "step": 3935 }, { "epoch": 1.7726113694315284, "grad_norm": 0.22510691667210447, "learning_rate": 3.8682715824316594e-07, "loss": 0.0966, "step": 3940 }, { "epoch": 1.7748612569371531, "grad_norm": 0.20906650720115227, "learning_rate": 3.792881550554373e-07, "loss": 0.0792, "step": 3945 }, { "epoch": 1.7771111444427778, "grad_norm": 0.27079014394170864, "learning_rate": 3.7182044460231605e-07, "loss": 0.0793, "step": 3950 }, { "epoch": 1.7793610319484026, "grad_norm": 0.1857139907781371, "learning_rate": 3.6442414210273834e-07, "loss": 0.0798, "step": 3955 }, { "epoch": 1.7816109194540273, "grad_norm": 0.2556479800636284, "learning_rate": 3.570993616738866e-07, "loss": 0.0848, "step": 3960 }, { "epoch": 1.783860806959652, "grad_norm": 0.20166968130742072, "learning_rate": 3.498462163294386e-07, "loss": 0.0811, "step": 3965 }, { "epoch": 1.7861106944652767, "grad_norm": 0.2536060487731229, "learning_rate": 3.426648179778147e-07, "loss": 0.0953, "step": 3970 }, { "epoch": 1.7883605819709014, "grad_norm": 0.2112175288890015, "learning_rate": 3.355552774204551e-07, "loss": 0.0762, "step": 3975 }, { "epoch": 1.7906104694765261, "grad_norm": 0.2295537149666403, "learning_rate": 3.2851770435010864e-07, "loss": 0.0767, "step": 3980 }, { "epoch": 1.7928603569821508, "grad_norm": 0.2866042831708544, "learning_rate": 3.215522073491434e-07, "loss": 0.0822, "step": 3985 }, { "epoch": 1.7951102444877756, "grad_norm": 0.21056995297624528, "learning_rate": 3.1465889388786697e-07, "loss": 0.0884, "step": 3990 }, { "epoch": 1.7973601319934003, "grad_norm": 0.2722790864581489, "learning_rate": 3.0783787032287407e-07, "loss": 0.0881, "step": 3995 }, { "epoch": 1.799610019499025, "grad_norm": 0.2828340673761126, "learning_rate": 3.010892418953981e-07, "loss": 0.0791, "step": 4000 }, { "epoch": 1.80185990700465, "grad_norm": 0.2144007429401686, "learning_rate": 2.9441311272969343e-07, "loss": 0.067, "step": 4005 }, { "epoch": 1.8041097945102744, "grad_norm": 0.21922591950990084, "learning_rate": 2.878095858314278e-07, "loss": 0.0702, "step": 4010 }, { "epoch": 1.8063596820158994, "grad_norm": 0.20896747701719126, "learning_rate": 2.812787630860919e-07, "loss": 0.078, "step": 4015 }, { "epoch": 1.8086095695215239, "grad_norm": 0.21476087685701412, "learning_rate": 2.7482074525742477e-07, "loss": 0.0688, "step": 4020 }, { "epoch": 1.8108594570271488, "grad_norm": 0.24301133633323727, "learning_rate": 2.6843563198586553e-07, "loss": 0.0804, "step": 4025 }, { "epoch": 1.8131093445327733, "grad_norm": 0.23191122306412676, "learning_rate": 2.621235217870116e-07, "loss": 0.0861, "step": 4030 }, { "epoch": 1.8153592320383982, "grad_norm": 0.207067846018882, "learning_rate": 2.55884512050098e-07, "loss": 0.0886, "step": 4035 }, { "epoch": 1.8176091195440227, "grad_norm": 0.22298796620779232, "learning_rate": 2.4971869903649916e-07, "loss": 0.0841, "step": 4040 }, { "epoch": 1.8198590070496476, "grad_norm": 0.2933340830070678, "learning_rate": 2.436261778782378e-07, "loss": 0.0794, "step": 4045 }, { "epoch": 1.8221088945552721, "grad_norm": 0.21972032956327708, "learning_rate": 2.3760704257652145e-07, "loss": 0.0774, "step": 4050 }, { "epoch": 1.824358782060897, "grad_norm": 0.2597840708263632, "learning_rate": 2.3166138600029198e-07, "loss": 0.0772, "step": 4055 }, { "epoch": 1.8266086695665216, "grad_norm": 0.24945833711183132, "learning_rate": 2.257892998847916e-07, "loss": 0.0758, "step": 4060 }, { "epoch": 1.8288585570721465, "grad_norm": 0.2108872276998458, "learning_rate": 2.1999087483014437e-07, "loss": 0.0742, "step": 4065 }, { "epoch": 1.831108444577771, "grad_norm": 0.2561087507310594, "learning_rate": 2.1426620029996516e-07, "loss": 0.078, "step": 4070 }, { "epoch": 1.833358332083396, "grad_norm": 0.18787526118382977, "learning_rate": 2.08615364619974e-07, "loss": 0.0679, "step": 4075 }, { "epoch": 1.8356082195890204, "grad_norm": 0.224194236181041, "learning_rate": 2.0303845497663566e-07, "loss": 0.0746, "step": 4080 }, { "epoch": 1.8378581070946454, "grad_norm": 0.2392479281560687, "learning_rate": 1.9753555741581277e-07, "loss": 0.0764, "step": 4085 }, { "epoch": 1.8401079946002699, "grad_norm": 0.21912472032156466, "learning_rate": 1.921067568414403e-07, "loss": 0.079, "step": 4090 }, { "epoch": 1.8423578821058948, "grad_norm": 0.25670091205707113, "learning_rate": 1.8675213701421223e-07, "loss": 0.0835, "step": 4095 }, { "epoch": 1.8446077696115193, "grad_norm": 0.2594177639066407, "learning_rate": 1.814717805502958e-07, "loss": 0.0803, "step": 4100 }, { "epoch": 1.8468576571171442, "grad_norm": 0.21469179882031758, "learning_rate": 1.762657689200481e-07, "loss": 0.0764, "step": 4105 }, { "epoch": 1.8491075446227687, "grad_norm": 0.2209544552641529, "learning_rate": 1.7113418244676493e-07, "loss": 0.0785, "step": 4110 }, { "epoch": 1.8513574321283937, "grad_norm": 0.24145566509686753, "learning_rate": 1.6607710030544122e-07, "loss": 0.0719, "step": 4115 }, { "epoch": 1.8536073196340181, "grad_norm": 0.2093424795846333, "learning_rate": 1.6109460052154802e-07, "loss": 0.0764, "step": 4120 }, { "epoch": 1.855857207139643, "grad_norm": 0.24494002119656788, "learning_rate": 1.561867599698258e-07, "loss": 0.0798, "step": 4125 }, { "epoch": 1.8581070946452676, "grad_norm": 0.22975170782618237, "learning_rate": 1.5135365437310534e-07, "loss": 0.0837, "step": 4130 }, { "epoch": 1.8603569821508925, "grad_norm": 0.23509810998937047, "learning_rate": 1.4659535830113368e-07, "loss": 0.0784, "step": 4135 }, { "epoch": 1.8626068696565172, "grad_norm": 0.21476403073025796, "learning_rate": 1.419119451694262e-07, "loss": 0.0735, "step": 4140 }, { "epoch": 1.864856757162142, "grad_norm": 0.206525508501757, "learning_rate": 1.3730348723813181e-07, "loss": 0.0693, "step": 4145 }, { "epoch": 1.8671066446677667, "grad_norm": 0.23677883602034755, "learning_rate": 1.3277005561092016e-07, "loss": 0.0765, "step": 4150 }, { "epoch": 1.8693565321733914, "grad_norm": 0.22864240045396528, "learning_rate": 1.2831172023388349e-07, "loss": 0.0682, "step": 4155 }, { "epoch": 1.871606419679016, "grad_norm": 0.23988380079630575, "learning_rate": 1.2392854989445925e-07, "loss": 0.0792, "step": 4160 }, { "epoch": 1.8738563071846408, "grad_norm": 0.21420842660768485, "learning_rate": 1.196206122203647e-07, "loss": 0.0723, "step": 4165 }, { "epoch": 1.8761061946902655, "grad_norm": 0.23599002078153936, "learning_rate": 1.153879736785568e-07, "loss": 0.0745, "step": 4170 }, { "epoch": 1.8783560821958902, "grad_norm": 0.19939870757943454, "learning_rate": 1.112306995742074e-07, "loss": 0.0764, "step": 4175 }, { "epoch": 1.880605969701515, "grad_norm": 0.3002967324611831, "learning_rate": 1.0714885404969288e-07, "loss": 0.0745, "step": 4180 }, { "epoch": 1.8828558572071397, "grad_norm": 0.2534936155963215, "learning_rate": 1.031425000836056e-07, "loss": 0.0805, "step": 4185 }, { "epoch": 1.8851057447127644, "grad_norm": 0.2099770583680312, "learning_rate": 9.921169948978293e-08, "loss": 0.0696, "step": 4190 }, { "epoch": 1.887355632218389, "grad_norm": 0.21406906185927677, "learning_rate": 9.535651291635362e-08, "loss": 0.0695, "step": 4195 }, { "epoch": 1.8896055197240138, "grad_norm": 0.222365141419886, "learning_rate": 9.157699984480018e-08, "loss": 0.0747, "step": 4200 }, { "epoch": 1.8918554072296385, "grad_norm": 0.26953866043532604, "learning_rate": 8.787321858904241e-08, "loss": 0.0707, "step": 4205 }, { "epoch": 1.8941052947352632, "grad_norm": 0.2625647283056072, "learning_rate": 8.424522629453924e-08, "loss": 0.0743, "step": 4210 }, { "epoch": 1.896355182240888, "grad_norm": 0.23612000625927326, "learning_rate": 8.06930789374033e-08, "loss": 0.0763, "step": 4215 }, { "epoch": 1.8986050697465127, "grad_norm": 0.20902097825740532, "learning_rate": 7.721683132354163e-08, "loss": 0.0744, "step": 4220 }, { "epoch": 1.9008549572521374, "grad_norm": 0.24284466003504482, "learning_rate": 7.381653708780578e-08, "loss": 0.0741, "step": 4225 }, { "epoch": 1.903104844757762, "grad_norm": 0.208669560652797, "learning_rate": 7.049224869316807e-08, "loss": 0.0711, "step": 4230 }, { "epoch": 1.9053547322633868, "grad_norm": 0.2250655987088142, "learning_rate": 6.724401742990993e-08, "loss": 0.0689, "step": 4235 }, { "epoch": 1.9076046197690115, "grad_norm": 0.21694307565283746, "learning_rate": 6.407189341483044e-08, "loss": 0.0761, "step": 4240 }, { "epoch": 1.9098545072746362, "grad_norm": 0.19311480533839126, "learning_rate": 6.097592559047405e-08, "loss": 0.0743, "step": 4245 }, { "epoch": 1.912104394780261, "grad_norm": 0.18525264406718234, "learning_rate": 5.795616172437624e-08, "loss": 0.0725, "step": 4250 }, { "epoch": 1.9143542822858857, "grad_norm": 0.1651176792199085, "learning_rate": 5.501264840832299e-08, "loss": 0.0761, "step": 4255 }, { "epoch": 1.9166041697915104, "grad_norm": 0.2260442730154628, "learning_rate": 5.214543105763692e-08, "loss": 0.0889, "step": 4260 }, { "epoch": 1.918854057297135, "grad_norm": 0.18715741530378352, "learning_rate": 4.935455391047228e-08, "loss": 0.0663, "step": 4265 }, { "epoch": 1.9211039448027598, "grad_norm": 0.27005890292615836, "learning_rate": 4.664006002713495e-08, "loss": 0.0728, "step": 4270 }, { "epoch": 1.9233538323083845, "grad_norm": 0.2849847348297522, "learning_rate": 4.400199128941573e-08, "loss": 0.077, "step": 4275 }, { "epoch": 1.9256037198140095, "grad_norm": 0.21209255590676854, "learning_rate": 4.1440388399948686e-08, "loss": 0.0668, "step": 4280 }, { "epoch": 1.927853607319634, "grad_norm": 0.24917025560084188, "learning_rate": 3.8955290881576566e-08, "loss": 0.0731, "step": 4285 }, { "epoch": 1.930103494825259, "grad_norm": 0.18031075827239754, "learning_rate": 3.654673707674639e-08, "loss": 0.0755, "step": 4290 }, { "epoch": 1.9323533823308834, "grad_norm": 0.29159777908954887, "learning_rate": 3.4214764146915936e-08, "loss": 0.0771, "step": 4295 }, { "epoch": 1.9346032698365083, "grad_norm": 0.2515730214349518, "learning_rate": 3.195940807198039e-08, "loss": 0.0718, "step": 4300 }, { "epoch": 1.9368531573421328, "grad_norm": 0.2359716229508164, "learning_rate": 2.9780703649716637e-08, "loss": 0.075, "step": 4305 }, { "epoch": 1.9391030448477578, "grad_norm": 0.2279891015303591, "learning_rate": 2.767868449524813e-08, "loss": 0.071, "step": 4310 }, { "epoch": 1.9413529323533822, "grad_norm": 0.24505091349172803, "learning_rate": 2.5653383040524228e-08, "loss": 0.074, "step": 4315 }, { "epoch": 1.9436028198590072, "grad_norm": 0.2478550376131261, "learning_rate": 2.370483053382111e-08, "loss": 0.0771, "step": 4320 }, { "epoch": 1.9458527073646317, "grad_norm": 0.2193182868657674, "learning_rate": 2.183305703925831e-08, "loss": 0.0756, "step": 4325 }, { "epoch": 1.9481025948702566, "grad_norm": 0.2746040123293314, "learning_rate": 2.0038091436337392e-08, "loss": 0.0684, "step": 4330 }, { "epoch": 1.950352482375881, "grad_norm": 0.27001950154682536, "learning_rate": 1.8319961419493436e-08, "loss": 0.0776, "step": 4335 }, { "epoch": 1.952602369881506, "grad_norm": 0.17878344688895553, "learning_rate": 1.667869349766982e-08, "loss": 0.0684, "step": 4340 }, { "epoch": 1.9548522573871305, "grad_norm": 0.2242691348688971, "learning_rate": 1.5114312993908532e-08, "loss": 0.0775, "step": 4345 }, { "epoch": 1.9571021448927555, "grad_norm": 0.30659381638740507, "learning_rate": 1.3626844044957733e-08, "loss": 0.0742, "step": 4350 }, { "epoch": 1.95935203239838, "grad_norm": 0.22927922548010668, "learning_rate": 1.2216309600903142e-08, "loss": 0.0758, "step": 4355 }, { "epoch": 1.961601919904005, "grad_norm": 0.24862858256846118, "learning_rate": 1.088273142481111e-08, "loss": 0.0716, "step": 4360 }, { "epoch": 1.9638518074096294, "grad_norm": 0.18094046454860382, "learning_rate": 9.626130092393326e-09, "loss": 0.0708, "step": 4365 }, { "epoch": 1.9661016949152543, "grad_norm": 0.22402544663253343, "learning_rate": 8.446524991689298e-09, "loss": 0.0724, "step": 4370 }, { "epoch": 1.9683515824208788, "grad_norm": 0.1742440920374198, "learning_rate": 7.343934322767699e-09, "loss": 0.0776, "step": 4375 }, { "epoch": 1.9706014699265038, "grad_norm": 0.22337791056104178, "learning_rate": 6.318375097446039e-09, "loss": 0.0662, "step": 4380 }, { "epoch": 1.9728513574321282, "grad_norm": 0.20821619733680874, "learning_rate": 5.369863139026432e-09, "loss": 0.0823, "step": 4385 }, { "epoch": 1.9751012449377532, "grad_norm": 0.28459129908034536, "learning_rate": 4.498413082053566e-09, "loss": 0.0847, "step": 4390 }, { "epoch": 1.9773511324433777, "grad_norm": 0.2487528636409247, "learning_rate": 3.704038372085994e-09, "loss": 0.0812, "step": 4395 }, { "epoch": 1.9796010199490026, "grad_norm": 0.20877793863085078, "learning_rate": 2.986751265493526e-09, "loss": 0.0726, "step": 4400 }, { "epoch": 1.981850907454627, "grad_norm": 0.2843353007301253, "learning_rate": 2.3465628292623776e-09, "loss": 0.0696, "step": 4405 }, { "epoch": 1.984100794960252, "grad_norm": 0.20964930087522637, "learning_rate": 1.7834829408286402e-09, "loss": 0.0669, "step": 4410 }, { "epoch": 1.9863506824658768, "grad_norm": 0.18510001166373868, "learning_rate": 1.297520287923404e-09, "loss": 0.0732, "step": 4415 }, { "epoch": 1.9886005699715015, "grad_norm": 0.1927246478831311, "learning_rate": 8.886823684417512e-10, "loss": 0.0798, "step": 4420 }, { "epoch": 1.9908504574771262, "grad_norm": 0.1773427078092757, "learning_rate": 5.56975490322853e-10, "loss": 0.0759, "step": 4425 }, { "epoch": 1.993100344982751, "grad_norm": 0.2305382589496537, "learning_rate": 3.0240477145559997e-10, "loss": 0.0862, "step": 4430 }, { "epoch": 1.9953502324883756, "grad_norm": 0.28373308100373656, "learning_rate": 1.24974139599221e-10, "loss": 0.0769, "step": 4435 }, { "epoch": 1.9976001199940003, "grad_norm": 0.19382710057534525, "learning_rate": 2.4686332322221286e-11, "loss": 0.0755, "step": 4440 }, { "epoch": 1.9994000299985002, "eval_loss": 0.08069541305303574, "eval_runtime": 54.013, "eval_samples_per_second": 19.958, "eval_steps_per_second": 4.999, "step": 4444 }, { "epoch": 1.9994000299985002, "step": 4444, "total_flos": 1.2064109084748546e+19, "train_loss": 0.2652757017731559, "train_runtime": 35452.6785, "train_samples_per_second": 6.017, "train_steps_per_second": 0.125 } ], "logging_steps": 5, "max_steps": 4444, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2064109084748546e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }