{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1911, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005232862375719519, "grad_norm": 1.6008316731720644, "learning_rate": 5.208333333333333e-08, "loss": 1.681, "step": 1 }, { "epoch": 0.0026164311878597592, "grad_norm": 1.5124995634557081, "learning_rate": 2.604166666666667e-07, "loss": 1.7583, "step": 5 }, { "epoch": 0.0052328623757195184, "grad_norm": 1.6881183838300193, "learning_rate": 5.208333333333334e-07, "loss": 1.7784, "step": 10 }, { "epoch": 0.007849293563579277, "grad_norm": 1.6569959855142122, "learning_rate": 7.8125e-07, "loss": 1.7571, "step": 15 }, { "epoch": 0.010465724751439037, "grad_norm": 1.4876244750187468, "learning_rate": 1.0416666666666667e-06, "loss": 1.7796, "step": 20 }, { "epoch": 0.013082155939298797, "grad_norm": 1.555761827132201, "learning_rate": 1.3020833333333335e-06, "loss": 1.7455, "step": 25 }, { "epoch": 0.015698587127158554, "grad_norm": 1.3344622915583402, "learning_rate": 1.5625e-06, "loss": 1.713, "step": 30 }, { "epoch": 0.018315018315018316, "grad_norm": 1.6878431208167544, "learning_rate": 1.8229166666666666e-06, "loss": 1.753, "step": 35 }, { "epoch": 0.020931449502878074, "grad_norm": 1.925154137160916, "learning_rate": 2.0833333333333334e-06, "loss": 1.7583, "step": 40 }, { "epoch": 0.023547880690737835, "grad_norm": 2.0658602507700143, "learning_rate": 2.3437500000000002e-06, "loss": 1.6918, "step": 45 }, { "epoch": 0.026164311878597593, "grad_norm": 2.2843015724617484, "learning_rate": 2.604166666666667e-06, "loss": 1.6991, "step": 50 }, { "epoch": 0.02878074306645735, "grad_norm": 2.0076765426852905, "learning_rate": 2.8645833333333334e-06, "loss": 1.6578, "step": 55 }, { "epoch": 0.03139717425431711, "grad_norm": 1.5242713335796327, "learning_rate": 3.125e-06, "loss": 1.6201, "step": 60 }, { "epoch": 0.034013605442176874, "grad_norm": 1.202884778723732, "learning_rate": 3.385416666666667e-06, "loss": 1.6545, "step": 65 }, { "epoch": 0.03663003663003663, "grad_norm": 0.9017919097747348, "learning_rate": 3.6458333333333333e-06, "loss": 1.6182, "step": 70 }, { "epoch": 0.03924646781789639, "grad_norm": 0.7155469892761825, "learning_rate": 3.90625e-06, "loss": 1.5853, "step": 75 }, { "epoch": 0.04186289900575615, "grad_norm": 0.5537857406833877, "learning_rate": 4.166666666666667e-06, "loss": 1.6142, "step": 80 }, { "epoch": 0.044479330193615906, "grad_norm": 0.5008944354387821, "learning_rate": 4.427083333333334e-06, "loss": 1.5458, "step": 85 }, { "epoch": 0.04709576138147567, "grad_norm": 0.4217272654697234, "learning_rate": 4.6875000000000004e-06, "loss": 1.5173, "step": 90 }, { "epoch": 0.04971219256933543, "grad_norm": 0.39469142742095514, "learning_rate": 4.947916666666667e-06, "loss": 1.5501, "step": 95 }, { "epoch": 0.052328623757195186, "grad_norm": 0.34359153398155595, "learning_rate": 5.208333333333334e-06, "loss": 1.5213, "step": 100 }, { "epoch": 0.054945054945054944, "grad_norm": 0.3099893379425512, "learning_rate": 5.468750000000001e-06, "loss": 1.6151, "step": 105 }, { "epoch": 0.0575614861329147, "grad_norm": 0.28278343285367047, "learning_rate": 5.729166666666667e-06, "loss": 1.514, "step": 110 }, { "epoch": 0.06017791732077447, "grad_norm": 0.25881346963420426, "learning_rate": 5.989583333333334e-06, "loss": 1.5072, "step": 115 }, { "epoch": 0.06279434850863422, "grad_norm": 0.24915604462449628, "learning_rate": 6.25e-06, "loss": 1.5199, "step": 120 }, { "epoch": 0.06541077969649398, "grad_norm": 0.2485517174096053, "learning_rate": 6.510416666666667e-06, "loss": 1.5366, "step": 125 }, { "epoch": 0.06802721088435375, "grad_norm": 0.22712811100348562, "learning_rate": 6.770833333333334e-06, "loss": 1.504, "step": 130 }, { "epoch": 0.0706436420722135, "grad_norm": 0.21446325271380284, "learning_rate": 7.031250000000001e-06, "loss": 1.5327, "step": 135 }, { "epoch": 0.07326007326007326, "grad_norm": 0.19651591348301906, "learning_rate": 7.291666666666667e-06, "loss": 1.4778, "step": 140 }, { "epoch": 0.07587650444793302, "grad_norm": 0.21362640404068758, "learning_rate": 7.552083333333334e-06, "loss": 1.4924, "step": 145 }, { "epoch": 0.07849293563579278, "grad_norm": 0.2086701893979055, "learning_rate": 7.8125e-06, "loss": 1.4934, "step": 150 }, { "epoch": 0.08110936682365254, "grad_norm": 0.19722374493413125, "learning_rate": 8.072916666666667e-06, "loss": 1.5194, "step": 155 }, { "epoch": 0.0837257980115123, "grad_norm": 0.197232116513064, "learning_rate": 8.333333333333334e-06, "loss": 1.5089, "step": 160 }, { "epoch": 0.08634222919937205, "grad_norm": 0.19736927732072057, "learning_rate": 8.59375e-06, "loss": 1.493, "step": 165 }, { "epoch": 0.08895866038723181, "grad_norm": 0.18999653216382684, "learning_rate": 8.854166666666667e-06, "loss": 1.5084, "step": 170 }, { "epoch": 0.09157509157509157, "grad_norm": 0.18816442922674095, "learning_rate": 9.114583333333334e-06, "loss": 1.4939, "step": 175 }, { "epoch": 0.09419152276295134, "grad_norm": 0.19457985638812048, "learning_rate": 9.375000000000001e-06, "loss": 1.5115, "step": 180 }, { "epoch": 0.0968079539508111, "grad_norm": 0.18306860662282393, "learning_rate": 9.635416666666668e-06, "loss": 1.51, "step": 185 }, { "epoch": 0.09942438513867086, "grad_norm": 0.1721853071759001, "learning_rate": 9.895833333333334e-06, "loss": 1.5121, "step": 190 }, { "epoch": 0.10204081632653061, "grad_norm": 0.18129124345256323, "learning_rate": 9.999924849924331e-06, "loss": 1.5169, "step": 195 }, { "epoch": 0.10465724751439037, "grad_norm": 0.17875223053180506, "learning_rate": 9.999465607642677e-06, "loss": 1.4897, "step": 200 }, { "epoch": 0.10727367870225013, "grad_norm": 0.1765984203759676, "learning_rate": 9.998588911421522e-06, "loss": 1.4531, "step": 205 }, { "epoch": 0.10989010989010989, "grad_norm": 0.16783670979039028, "learning_rate": 9.99729483446475e-06, "loss": 1.4821, "step": 210 }, { "epoch": 0.11250654107796965, "grad_norm": 0.17395143750443126, "learning_rate": 9.995583484827415e-06, "loss": 1.5229, "step": 215 }, { "epoch": 0.1151229722658294, "grad_norm": 0.16605004371887766, "learning_rate": 9.993455005406717e-06, "loss": 1.441, "step": 220 }, { "epoch": 0.11773940345368916, "grad_norm": 0.1700314362693217, "learning_rate": 9.990909573930075e-06, "loss": 1.4892, "step": 225 }, { "epoch": 0.12035583464154893, "grad_norm": 0.16948161898267894, "learning_rate": 9.987947402940285e-06, "loss": 1.5555, "step": 230 }, { "epoch": 0.12297226582940869, "grad_norm": 0.16463177711173213, "learning_rate": 9.984568739777776e-06, "loss": 1.4554, "step": 235 }, { "epoch": 0.12558869701726844, "grad_norm": 0.16770654591359932, "learning_rate": 9.980773866559946e-06, "loss": 1.4815, "step": 240 }, { "epoch": 0.1282051282051282, "grad_norm": 0.16640092407199292, "learning_rate": 9.976563100157615e-06, "loss": 1.4868, "step": 245 }, { "epoch": 0.13082155939298795, "grad_norm": 0.16699146902728054, "learning_rate": 9.971936792168569e-06, "loss": 1.4735, "step": 250 }, { "epoch": 0.13343799058084774, "grad_norm": 0.1743960884970157, "learning_rate": 9.966895328888195e-06, "loss": 1.4793, "step": 255 }, { "epoch": 0.1360544217687075, "grad_norm": 0.1702507585575381, "learning_rate": 9.961439131277223e-06, "loss": 1.4823, "step": 260 }, { "epoch": 0.13867085295656725, "grad_norm": 0.16235653984108958, "learning_rate": 9.955568654926582e-06, "loss": 1.4555, "step": 265 }, { "epoch": 0.141287284144427, "grad_norm": 0.15373757230170634, "learning_rate": 9.949284390019362e-06, "loss": 1.4332, "step": 270 }, { "epoch": 0.14390371533228677, "grad_norm": 0.1639742183162807, "learning_rate": 9.942586861289874e-06, "loss": 1.497, "step": 275 }, { "epoch": 0.14652014652014653, "grad_norm": 0.1632185984817443, "learning_rate": 9.935476627979837e-06, "loss": 1.4556, "step": 280 }, { "epoch": 0.14913657770800628, "grad_norm": 0.16327675062600183, "learning_rate": 9.927954283791687e-06, "loss": 1.4614, "step": 285 }, { "epoch": 0.15175300889586604, "grad_norm": 0.17441801496249207, "learning_rate": 9.920020456838998e-06, "loss": 1.5062, "step": 290 }, { "epoch": 0.1543694400837258, "grad_norm": 0.16030662489136668, "learning_rate": 9.911675809594042e-06, "loss": 1.4913, "step": 295 }, { "epoch": 0.15698587127158556, "grad_norm": 0.1573623101473936, "learning_rate": 9.902921038832456e-06, "loss": 1.4614, "step": 300 }, { "epoch": 0.15960230245944532, "grad_norm": 0.15793397911742527, "learning_rate": 9.893756875575082e-06, "loss": 1.4465, "step": 305 }, { "epoch": 0.16221873364730507, "grad_norm": 0.16568484828821042, "learning_rate": 9.884184085026918e-06, "loss": 1.4444, "step": 310 }, { "epoch": 0.16483516483516483, "grad_norm": 0.17311490884226297, "learning_rate": 9.874203466513215e-06, "loss": 1.4701, "step": 315 }, { "epoch": 0.1674515960230246, "grad_norm": 0.1721852603261551, "learning_rate": 9.863815853412748e-06, "loss": 1.4582, "step": 320 }, { "epoch": 0.17006802721088435, "grad_norm": 0.164698820484247, "learning_rate": 9.853022113088223e-06, "loss": 1.5308, "step": 325 }, { "epoch": 0.1726844583987441, "grad_norm": 0.1586763981745356, "learning_rate": 9.84182314681385e-06, "loss": 1.4744, "step": 330 }, { "epoch": 0.17530088958660386, "grad_norm": 0.15689735846007058, "learning_rate": 9.83021988970009e-06, "loss": 1.4753, "step": 335 }, { "epoch": 0.17791732077446362, "grad_norm": 0.1575630177309333, "learning_rate": 9.818213310615575e-06, "loss": 1.4579, "step": 340 }, { "epoch": 0.18053375196232338, "grad_norm": 0.1631768128880243, "learning_rate": 9.805804412106197e-06, "loss": 1.4541, "step": 345 }, { "epoch": 0.18315018315018314, "grad_norm": 0.15686973370987312, "learning_rate": 9.792994230311419e-06, "loss": 1.406, "step": 350 }, { "epoch": 0.1857666143380429, "grad_norm": 0.1605407693787364, "learning_rate": 9.779783834877727e-06, "loss": 1.4506, "step": 355 }, { "epoch": 0.18838304552590268, "grad_norm": 0.1566921205098481, "learning_rate": 9.766174328869344e-06, "loss": 1.4208, "step": 360 }, { "epoch": 0.19099947671376244, "grad_norm": 0.1558094611868576, "learning_rate": 9.752166848676101e-06, "loss": 1.4643, "step": 365 }, { "epoch": 0.1936159079016222, "grad_norm": 0.15672567518111116, "learning_rate": 9.737762563918564e-06, "loss": 1.4534, "step": 370 }, { "epoch": 0.19623233908948196, "grad_norm": 0.1535562279342849, "learning_rate": 9.722962677350367e-06, "loss": 1.4771, "step": 375 }, { "epoch": 0.1988487702773417, "grad_norm": 0.183419499409996, "learning_rate": 9.707768424757778e-06, "loss": 1.4415, "step": 380 }, { "epoch": 0.20146520146520147, "grad_norm": 0.16119980470890002, "learning_rate": 9.692181074856515e-06, "loss": 1.523, "step": 385 }, { "epoch": 0.20408163265306123, "grad_norm": 0.16455686201587852, "learning_rate": 9.676201929185809e-06, "loss": 1.4638, "step": 390 }, { "epoch": 0.206698063840921, "grad_norm": 0.15623412201622025, "learning_rate": 9.659832321999727e-06, "loss": 1.4955, "step": 395 }, { "epoch": 0.20931449502878074, "grad_norm": 0.15115523058807412, "learning_rate": 9.643073620155755e-06, "loss": 1.4509, "step": 400 }, { "epoch": 0.2119309262166405, "grad_norm": 0.16043581067705567, "learning_rate": 9.625927223000679e-06, "loss": 1.5064, "step": 405 }, { "epoch": 0.21454735740450026, "grad_norm": 0.16061562830754533, "learning_rate": 9.608394562253724e-06, "loss": 1.44, "step": 410 }, { "epoch": 0.21716378859236002, "grad_norm": 0.1610549978873481, "learning_rate": 9.590477101887016e-06, "loss": 1.5057, "step": 415 }, { "epoch": 0.21978021978021978, "grad_norm": 0.16201649106784555, "learning_rate": 9.572176338003341e-06, "loss": 1.4478, "step": 420 }, { "epoch": 0.22239665096807953, "grad_norm": 0.15700846939992966, "learning_rate": 9.553493798711217e-06, "loss": 1.4529, "step": 425 }, { "epoch": 0.2250130821559393, "grad_norm": 0.16614805498541974, "learning_rate": 9.534431043997298e-06, "loss": 1.4327, "step": 430 }, { "epoch": 0.22762951334379905, "grad_norm": 0.15339862150367098, "learning_rate": 9.514989665596114e-06, "loss": 1.468, "step": 435 }, { "epoch": 0.2302459445316588, "grad_norm": 0.16105247258500596, "learning_rate": 9.495171286857171e-06, "loss": 1.4999, "step": 440 }, { "epoch": 0.23286237571951857, "grad_norm": 0.14812137703125702, "learning_rate": 9.47497756260939e-06, "loss": 1.4194, "step": 445 }, { "epoch": 0.23547880690737832, "grad_norm": 0.15329658500251478, "learning_rate": 9.454410179022932e-06, "loss": 1.4579, "step": 450 }, { "epoch": 0.23809523809523808, "grad_norm": 0.1615690171559319, "learning_rate": 9.433470853468409e-06, "loss": 1.4626, "step": 455 }, { "epoch": 0.24071166928309787, "grad_norm": 0.1545180409071662, "learning_rate": 9.412161334373477e-06, "loss": 1.4709, "step": 460 }, { "epoch": 0.24332810047095763, "grad_norm": 0.15399340754206403, "learning_rate": 9.39048340107685e-06, "loss": 1.4776, "step": 465 }, { "epoch": 0.24594453165881738, "grad_norm": 0.15962881447905178, "learning_rate": 9.36843886367972e-06, "loss": 1.4593, "step": 470 }, { "epoch": 0.24856096284667714, "grad_norm": 0.1630934673570493, "learning_rate": 9.346029562894616e-06, "loss": 1.4681, "step": 475 }, { "epoch": 0.25117739403453687, "grad_norm": 0.1648479272141616, "learning_rate": 9.323257369891702e-06, "loss": 1.4364, "step": 480 }, { "epoch": 0.25379382522239663, "grad_norm": 0.17086807544238972, "learning_rate": 9.300124186142542e-06, "loss": 1.483, "step": 485 }, { "epoch": 0.2564102564102564, "grad_norm": 0.1554704282008905, "learning_rate": 9.276631943261325e-06, "loss": 1.439, "step": 490 }, { "epoch": 0.25902668759811615, "grad_norm": 0.15622543010651593, "learning_rate": 9.252782602843565e-06, "loss": 1.5031, "step": 495 }, { "epoch": 0.2616431187859759, "grad_norm": 0.16771215479185553, "learning_rate": 9.228578156302327e-06, "loss": 1.4406, "step": 500 }, { "epoch": 0.26425954997383566, "grad_norm": 0.16484311397184792, "learning_rate": 9.204020624701932e-06, "loss": 1.4803, "step": 505 }, { "epoch": 0.2668759811616955, "grad_norm": 0.16013346151268085, "learning_rate": 9.1791120585892e-06, "loss": 1.4161, "step": 510 }, { "epoch": 0.26949241234955523, "grad_norm": 0.16165617987217745, "learning_rate": 9.153854537822235e-06, "loss": 1.464, "step": 515 }, { "epoch": 0.272108843537415, "grad_norm": 0.15775246590680753, "learning_rate": 9.12825017139675e-06, "loss": 1.4856, "step": 520 }, { "epoch": 0.27472527472527475, "grad_norm": 0.15261801905816583, "learning_rate": 9.102301097269974e-06, "loss": 1.4921, "step": 525 }, { "epoch": 0.2773417059131345, "grad_norm": 0.17027676618685536, "learning_rate": 9.076009482182132e-06, "loss": 1.4594, "step": 530 }, { "epoch": 0.27995813710099426, "grad_norm": 0.15658397006356073, "learning_rate": 9.049377521475514e-06, "loss": 1.4449, "step": 535 }, { "epoch": 0.282574568288854, "grad_norm": 0.16413577886883224, "learning_rate": 9.022407438911177e-06, "loss": 1.4784, "step": 540 }, { "epoch": 0.2851909994767138, "grad_norm": 0.15420397374950906, "learning_rate": 8.99510148648325e-06, "loss": 1.4453, "step": 545 }, { "epoch": 0.28780743066457354, "grad_norm": 0.15750798306390343, "learning_rate": 8.967461944230908e-06, "loss": 1.4498, "step": 550 }, { "epoch": 0.2904238618524333, "grad_norm": 0.1631028988305524, "learning_rate": 8.939491120047974e-06, "loss": 1.4998, "step": 555 }, { "epoch": 0.29304029304029305, "grad_norm": 0.16042232539081466, "learning_rate": 8.911191349490215e-06, "loss": 1.4617, "step": 560 }, { "epoch": 0.2956567242281528, "grad_norm": 0.15786335167092433, "learning_rate": 8.882564995580329e-06, "loss": 1.4675, "step": 565 }, { "epoch": 0.29827315541601257, "grad_norm": 0.1509933414399425, "learning_rate": 8.85361444861063e-06, "loss": 1.449, "step": 570 }, { "epoch": 0.3008895866038723, "grad_norm": 0.16307057514008638, "learning_rate": 8.824342125943461e-06, "loss": 1.4394, "step": 575 }, { "epoch": 0.3035060177917321, "grad_norm": 0.15286343283349654, "learning_rate": 8.79475047180934e-06, "loss": 1.4926, "step": 580 }, { "epoch": 0.30612244897959184, "grad_norm": 0.15816557747601245, "learning_rate": 8.764841957102866e-06, "loss": 1.4991, "step": 585 }, { "epoch": 0.3087388801674516, "grad_norm": 0.15992999802079433, "learning_rate": 8.734619079176416e-06, "loss": 1.4967, "step": 590 }, { "epoch": 0.31135531135531136, "grad_norm": 0.15559994076241815, "learning_rate": 8.704084361631597e-06, "loss": 1.4602, "step": 595 }, { "epoch": 0.3139717425431711, "grad_norm": 0.16010926275335507, "learning_rate": 8.673240354108539e-06, "loss": 1.4233, "step": 600 }, { "epoch": 0.3165881737310309, "grad_norm": 0.16290886252895473, "learning_rate": 8.642089632072992e-06, "loss": 1.4039, "step": 605 }, { "epoch": 0.31920460491889063, "grad_norm": 0.1586363120366518, "learning_rate": 8.61063479660128e-06, "loss": 1.4828, "step": 610 }, { "epoch": 0.3218210361067504, "grad_norm": 0.16209454857234118, "learning_rate": 8.578878474163115e-06, "loss": 1.5014, "step": 615 }, { "epoch": 0.32443746729461015, "grad_norm": 0.15224653908638042, "learning_rate": 8.546823316402282e-06, "loss": 1.4809, "step": 620 }, { "epoch": 0.3270538984824699, "grad_norm": 0.159244210541448, "learning_rate": 8.514471999915229e-06, "loss": 1.4946, "step": 625 }, { "epoch": 0.32967032967032966, "grad_norm": 0.1521622865645663, "learning_rate": 8.48182722602757e-06, "loss": 1.4713, "step": 630 }, { "epoch": 0.3322867608581894, "grad_norm": 0.15381664517913313, "learning_rate": 8.448891720568535e-06, "loss": 1.4452, "step": 635 }, { "epoch": 0.3349031920460492, "grad_norm": 0.1582064130471886, "learning_rate": 8.415668233643346e-06, "loss": 1.4305, "step": 640 }, { "epoch": 0.33751962323390894, "grad_norm": 0.16426813337977975, "learning_rate": 8.382159539403605e-06, "loss": 1.5116, "step": 645 }, { "epoch": 0.3401360544217687, "grad_norm": 0.15908674556867114, "learning_rate": 8.348368435815636e-06, "loss": 1.4454, "step": 650 }, { "epoch": 0.34275248560962845, "grad_norm": 0.15658041474741916, "learning_rate": 8.314297744426865e-06, "loss": 1.4428, "step": 655 }, { "epoch": 0.3453689167974882, "grad_norm": 0.15747154649657244, "learning_rate": 8.279950310130218e-06, "loss": 1.4417, "step": 660 }, { "epoch": 0.34798534798534797, "grad_norm": 0.1589936285906929, "learning_rate": 8.245329000926574e-06, "loss": 1.4563, "step": 665 }, { "epoch": 0.35060177917320773, "grad_norm": 0.1613181591302504, "learning_rate": 8.210436707685286e-06, "loss": 1.4754, "step": 670 }, { "epoch": 0.3532182103610675, "grad_norm": 0.165332497765771, "learning_rate": 8.175276343902802e-06, "loss": 1.4547, "step": 675 }, { "epoch": 0.35583464154892724, "grad_norm": 0.1578563622480374, "learning_rate": 8.139850845459378e-06, "loss": 1.4565, "step": 680 }, { "epoch": 0.358451072736787, "grad_norm": 0.16107629875736493, "learning_rate": 8.104163170373942e-06, "loss": 1.4473, "step": 685 }, { "epoch": 0.36106750392464676, "grad_norm": 0.16625542644136174, "learning_rate": 8.068216298557088e-06, "loss": 1.4833, "step": 690 }, { "epoch": 0.3636839351125065, "grad_norm": 0.15680894783853527, "learning_rate": 8.032013231562271e-06, "loss": 1.4802, "step": 695 }, { "epoch": 0.3663003663003663, "grad_norm": 0.16181472227400934, "learning_rate": 7.995556992335168e-06, "loss": 1.4705, "step": 700 }, { "epoch": 0.36891679748822603, "grad_norm": 0.15686803017929454, "learning_rate": 7.95885062496126e-06, "loss": 1.4592, "step": 705 }, { "epoch": 0.3715332286760858, "grad_norm": 0.1579836788382813, "learning_rate": 7.92189719441166e-06, "loss": 1.4179, "step": 710 }, { "epoch": 0.3741496598639456, "grad_norm": 0.15713818521977135, "learning_rate": 7.884699786287188e-06, "loss": 1.4881, "step": 715 }, { "epoch": 0.37676609105180536, "grad_norm": 0.15909147673646215, "learning_rate": 7.847261506560716e-06, "loss": 1.4779, "step": 720 }, { "epoch": 0.3793825222396651, "grad_norm": 0.15891568593086702, "learning_rate": 7.809585481317824e-06, "loss": 1.4525, "step": 725 }, { "epoch": 0.3819989534275249, "grad_norm": 0.1555272930916928, "learning_rate": 7.77167485649578e-06, "loss": 1.4803, "step": 730 }, { "epoch": 0.38461538461538464, "grad_norm": 0.15344838974292666, "learning_rate": 7.733532797620849e-06, "loss": 1.4693, "step": 735 }, { "epoch": 0.3872318158032444, "grad_norm": 0.15915416421044593, "learning_rate": 7.695162489543966e-06, "loss": 1.5005, "step": 740 }, { "epoch": 0.38984824699110415, "grad_norm": 0.1642521243641894, "learning_rate": 7.656567136174817e-06, "loss": 1.4154, "step": 745 }, { "epoch": 0.3924646781789639, "grad_norm": 0.1522074237366889, "learning_rate": 7.6177499602143e-06, "loss": 1.4141, "step": 750 }, { "epoch": 0.39508110936682367, "grad_norm": 0.15826765900009956, "learning_rate": 7.578714202885436e-06, "loss": 1.491, "step": 755 }, { "epoch": 0.3976975405546834, "grad_norm": 0.16460953142733703, "learning_rate": 7.53946312366273e-06, "loss": 1.4553, "step": 760 }, { "epoch": 0.4003139717425432, "grad_norm": 0.16346924185292147, "learning_rate": 7.500000000000001e-06, "loss": 1.3906, "step": 765 }, { "epoch": 0.40293040293040294, "grad_norm": 0.16031939514368226, "learning_rate": 7.460328127056718e-06, "loss": 1.4978, "step": 770 }, { "epoch": 0.4055468341182627, "grad_norm": 0.1582953066263729, "learning_rate": 7.420450817422855e-06, "loss": 1.4618, "step": 775 }, { "epoch": 0.40816326530612246, "grad_norm": 0.16134419804701114, "learning_rate": 7.38037140084229e-06, "loss": 1.4654, "step": 780 }, { "epoch": 0.4107796964939822, "grad_norm": 0.1590857338863284, "learning_rate": 7.340093223934775e-06, "loss": 1.4632, "step": 785 }, { "epoch": 0.413396127681842, "grad_norm": 0.1599207804357529, "learning_rate": 7.29961964991649e-06, "loss": 1.4222, "step": 790 }, { "epoch": 0.41601255886970173, "grad_norm": 0.16680739008193485, "learning_rate": 7.2589540583192165e-06, "loss": 1.5157, "step": 795 }, { "epoch": 0.4186289900575615, "grad_norm": 0.15668314190198573, "learning_rate": 7.218099844708152e-06, "loss": 1.4838, "step": 800 }, { "epoch": 0.42124542124542125, "grad_norm": 0.16035073862413637, "learning_rate": 7.177060420398376e-06, "loss": 1.4573, "step": 805 }, { "epoch": 0.423861852433281, "grad_norm": 0.15900953162107145, "learning_rate": 7.135839212170008e-06, "loss": 1.4492, "step": 810 }, { "epoch": 0.42647828362114076, "grad_norm": 0.16479547257307764, "learning_rate": 7.094439661982072e-06, "loss": 1.4661, "step": 815 }, { "epoch": 0.4290947148090005, "grad_norm": 0.15392631508278395, "learning_rate": 7.0528652266850935e-06, "loss": 1.4521, "step": 820 }, { "epoch": 0.4317111459968603, "grad_norm": 0.15684281936206454, "learning_rate": 7.011119377732459e-06, "loss": 1.4509, "step": 825 }, { "epoch": 0.43432757718472004, "grad_norm": 0.16360635983799987, "learning_rate": 6.969205600890539e-06, "loss": 1.4661, "step": 830 }, { "epoch": 0.4369440083725798, "grad_norm": 0.15446157313570677, "learning_rate": 6.9271273959476415e-06, "loss": 1.4724, "step": 835 }, { "epoch": 0.43956043956043955, "grad_norm": 0.15684403662844304, "learning_rate": 6.884888276421766e-06, "loss": 1.4417, "step": 840 }, { "epoch": 0.4421768707482993, "grad_norm": 0.1623177931087071, "learning_rate": 6.842491769267241e-06, "loss": 1.4657, "step": 845 }, { "epoch": 0.44479330193615907, "grad_norm": 0.15191793713718904, "learning_rate": 6.79994141458021e-06, "loss": 1.4758, "step": 850 }, { "epoch": 0.4474097331240188, "grad_norm": 0.1580816385183261, "learning_rate": 6.757240765303047e-06, "loss": 1.4354, "step": 855 }, { "epoch": 0.4500261643118786, "grad_norm": 0.15469805356817723, "learning_rate": 6.7143933869276755e-06, "loss": 1.423, "step": 860 }, { "epoch": 0.45264259549973834, "grad_norm": 0.15867250364888047, "learning_rate": 6.671402857197864e-06, "loss": 1.4333, "step": 865 }, { "epoch": 0.4552590266875981, "grad_norm": 0.15450873366986678, "learning_rate": 6.628272765810468e-06, "loss": 1.5001, "step": 870 }, { "epoch": 0.45787545787545786, "grad_norm": 0.15904540784415175, "learning_rate": 6.585006714115709e-06, "loss": 1.4568, "step": 875 }, { "epoch": 0.4604918890633176, "grad_norm": 0.18783657076424265, "learning_rate": 6.541608314816451e-06, "loss": 1.4828, "step": 880 }, { "epoch": 0.4631083202511774, "grad_norm": 0.15633907898033594, "learning_rate": 6.498081191666549e-06, "loss": 1.3961, "step": 885 }, { "epoch": 0.46572475143903713, "grad_norm": 0.1562637792140999, "learning_rate": 6.454428979168257e-06, "loss": 1.4612, "step": 890 }, { "epoch": 0.4683411826268969, "grad_norm": 0.16274637193792332, "learning_rate": 6.410655322268758e-06, "loss": 1.4744, "step": 895 }, { "epoch": 0.47095761381475665, "grad_norm": 0.16139617681241109, "learning_rate": 6.3667638760558055e-06, "loss": 1.4937, "step": 900 }, { "epoch": 0.4735740450026164, "grad_norm": 0.16266632807711848, "learning_rate": 6.3227583054525296e-06, "loss": 1.4359, "step": 905 }, { "epoch": 0.47619047619047616, "grad_norm": 0.15220372261088064, "learning_rate": 6.2786422849114074e-06, "loss": 1.4534, "step": 910 }, { "epoch": 0.478806907378336, "grad_norm": 0.1585996226107583, "learning_rate": 6.2344194981074616e-06, "loss": 1.4731, "step": 915 }, { "epoch": 0.48142333856619574, "grad_norm": 0.164545533910534, "learning_rate": 6.190093637630662e-06, "loss": 1.4888, "step": 920 }, { "epoch": 0.4840397697540555, "grad_norm": 0.16298648791233225, "learning_rate": 6.145668404677604e-06, "loss": 1.4198, "step": 925 }, { "epoch": 0.48665620094191525, "grad_norm": 0.16942313057138814, "learning_rate": 6.101147508742456e-06, "loss": 1.4351, "step": 930 }, { "epoch": 0.489272632129775, "grad_norm": 0.15711211853308713, "learning_rate": 6.056534667307212e-06, "loss": 1.4644, "step": 935 }, { "epoch": 0.49188906331763477, "grad_norm": 0.15626699824988666, "learning_rate": 6.011833605531295e-06, "loss": 1.4304, "step": 940 }, { "epoch": 0.4945054945054945, "grad_norm": 0.17131121620072295, "learning_rate": 5.967048055940503e-06, "loss": 1.4363, "step": 945 }, { "epoch": 0.4971219256933543, "grad_norm": 0.16411315909203975, "learning_rate": 5.922181758115333e-06, "loss": 1.4948, "step": 950 }, { "epoch": 0.49973835688121404, "grad_norm": 0.1652087290533154, "learning_rate": 5.8772384583787455e-06, "loss": 1.4749, "step": 955 }, { "epoch": 0.5023547880690737, "grad_norm": 0.1527872810679704, "learning_rate": 5.832221909483334e-06, "loss": 1.5299, "step": 960 }, { "epoch": 0.5049712192569336, "grad_norm": 0.16242876178871807, "learning_rate": 5.787135870297976e-06, "loss": 1.4302, "step": 965 }, { "epoch": 0.5075876504447933, "grad_norm": 0.15914276616432216, "learning_rate": 5.741984105493967e-06, "loss": 1.4855, "step": 970 }, { "epoch": 0.5102040816326531, "grad_norm": 0.15942651171220024, "learning_rate": 5.696770385230679e-06, "loss": 1.4426, "step": 975 }, { "epoch": 0.5128205128205128, "grad_norm": 0.15807600841433533, "learning_rate": 5.651498484840737e-06, "loss": 1.4827, "step": 980 }, { "epoch": 0.5154369440083726, "grad_norm": 0.15731445684573817, "learning_rate": 5.6061721845148e-06, "loss": 1.4354, "step": 985 }, { "epoch": 0.5180533751962323, "grad_norm": 0.15827375236560648, "learning_rate": 5.560795268985899e-06, "loss": 1.442, "step": 990 }, { "epoch": 0.5206698063840921, "grad_norm": 0.1578476109009046, "learning_rate": 5.515371527213422e-06, "loss": 1.4272, "step": 995 }, { "epoch": 0.5232862375719518, "grad_norm": 0.161165877259809, "learning_rate": 5.469904752066736e-06, "loss": 1.4602, "step": 1000 }, { "epoch": 0.5259026687598116, "grad_norm": 0.15792406612385534, "learning_rate": 5.424398740008481e-06, "loss": 1.4762, "step": 1005 }, { "epoch": 0.5285190999476713, "grad_norm": 0.16143656535191364, "learning_rate": 5.378857290777566e-06, "loss": 1.4597, "step": 1010 }, { "epoch": 0.5311355311355311, "grad_norm": 0.1525842123237197, "learning_rate": 5.333284207071901e-06, "loss": 1.4559, "step": 1015 }, { "epoch": 0.533751962323391, "grad_norm": 0.1600143521665237, "learning_rate": 5.287683294230855e-06, "loss": 1.4993, "step": 1020 }, { "epoch": 0.5363683935112507, "grad_norm": 0.1671409495356182, "learning_rate": 5.242058359917531e-06, "loss": 1.4362, "step": 1025 }, { "epoch": 0.5389848246991105, "grad_norm": 0.16601362069808856, "learning_rate": 5.196413213800812e-06, "loss": 1.4371, "step": 1030 }, { "epoch": 0.5416012558869702, "grad_norm": 0.15498287472397484, "learning_rate": 5.150751667237266e-06, "loss": 1.4294, "step": 1035 }, { "epoch": 0.54421768707483, "grad_norm": 0.15961404392992412, "learning_rate": 5.1050775329528865e-06, "loss": 1.4789, "step": 1040 }, { "epoch": 0.5468341182626897, "grad_norm": 0.15949553392591714, "learning_rate": 5.059394624724749e-06, "loss": 1.4851, "step": 1045 }, { "epoch": 0.5494505494505495, "grad_norm": 0.16116839795222396, "learning_rate": 5.0137067570625345e-06, "loss": 1.4458, "step": 1050 }, { "epoch": 0.5520669806384092, "grad_norm": 0.15971613721230996, "learning_rate": 4.968017744890052e-06, "loss": 1.4697, "step": 1055 }, { "epoch": 0.554683411826269, "grad_norm": 0.16118914480125368, "learning_rate": 4.922331403226667e-06, "loss": 1.4442, "step": 1060 }, { "epoch": 0.5572998430141287, "grad_norm": 0.1620932053747273, "learning_rate": 4.876651546868759e-06, "loss": 1.4814, "step": 1065 }, { "epoch": 0.5599162742019885, "grad_norm": 0.15581908215479442, "learning_rate": 4.830981990071193e-06, "loss": 1.4428, "step": 1070 }, { "epoch": 0.5625327053898482, "grad_norm": 0.155369479068761, "learning_rate": 4.785326546228818e-06, "loss": 1.4835, "step": 1075 }, { "epoch": 0.565149136577708, "grad_norm": 0.16291117247601922, "learning_rate": 4.739689027558052e-06, "loss": 1.443, "step": 1080 }, { "epoch": 0.5677655677655677, "grad_norm": 0.15825965604814435, "learning_rate": 4.694073244778571e-06, "loss": 1.4788, "step": 1085 }, { "epoch": 0.5703819989534276, "grad_norm": 0.16221449839650126, "learning_rate": 4.648483006795115e-06, "loss": 1.4565, "step": 1090 }, { "epoch": 0.5729984301412873, "grad_norm": 0.1631035307892455, "learning_rate": 4.602922120379432e-06, "loss": 1.423, "step": 1095 }, { "epoch": 0.5756148613291471, "grad_norm": 0.15523741305430758, "learning_rate": 4.557394389852427e-06, "loss": 1.4925, "step": 1100 }, { "epoch": 0.5782312925170068, "grad_norm": 0.16229543459181653, "learning_rate": 4.5119036167664966e-06, "loss": 1.4646, "step": 1105 }, { "epoch": 0.5808477237048666, "grad_norm": 0.17203262275951273, "learning_rate": 4.466453599588103e-06, "loss": 1.4874, "step": 1110 }, { "epoch": 0.5834641548927263, "grad_norm": 0.1523478817379919, "learning_rate": 4.421048133380601e-06, "loss": 1.4031, "step": 1115 }, { "epoch": 0.5860805860805861, "grad_norm": 0.15873848031137183, "learning_rate": 4.375691009487351e-06, "loss": 1.4796, "step": 1120 }, { "epoch": 0.5886970172684458, "grad_norm": 0.16337788296807224, "learning_rate": 4.330386015215145e-06, "loss": 1.5251, "step": 1125 }, { "epoch": 0.5913134484563056, "grad_norm": 0.1602836054740823, "learning_rate": 4.285136933517971e-06, "loss": 1.4303, "step": 1130 }, { "epoch": 0.5939298796441653, "grad_norm": 0.15647438547435116, "learning_rate": 4.239947542681125e-06, "loss": 1.4039, "step": 1135 }, { "epoch": 0.5965463108320251, "grad_norm": 0.15857408026812378, "learning_rate": 4.194821616005738e-06, "loss": 1.3983, "step": 1140 }, { "epoch": 0.5991627420198848, "grad_norm": 0.15775093783775374, "learning_rate": 4.1497629214937e-06, "loss": 1.4372, "step": 1145 }, { "epoch": 0.6017791732077447, "grad_norm": 0.18509829862091232, "learning_rate": 4.104775221533039e-06, "loss": 1.4806, "step": 1150 }, { "epoch": 0.6043956043956044, "grad_norm": 0.1683414103321442, "learning_rate": 4.059862272583755e-06, "loss": 1.4842, "step": 1155 }, { "epoch": 0.6070120355834642, "grad_norm": 0.15725746906302956, "learning_rate": 4.015027824864158e-06, "loss": 1.474, "step": 1160 }, { "epoch": 0.6096284667713239, "grad_norm": 0.156823916865726, "learning_rate": 3.97027562203773e-06, "loss": 1.4457, "step": 1165 }, { "epoch": 0.6122448979591837, "grad_norm": 0.16196818890564343, "learning_rate": 3.92560940090053e-06, "loss": 1.4549, "step": 1170 }, { "epoch": 0.6148613291470434, "grad_norm": 0.16431998359127215, "learning_rate": 3.881032891069169e-06, "loss": 1.4569, "step": 1175 }, { "epoch": 0.6174777603349032, "grad_norm": 0.1624294435977778, "learning_rate": 3.836549814669389e-06, "loss": 1.4707, "step": 1180 }, { "epoch": 0.6200941915227629, "grad_norm": 0.15807576657487513, "learning_rate": 3.7921638860252674e-06, "loss": 1.4445, "step": 1185 }, { "epoch": 0.6227106227106227, "grad_norm": 0.15852155364559742, "learning_rate": 3.747878811349075e-06, "loss": 1.4828, "step": 1190 }, { "epoch": 0.6253270538984824, "grad_norm": 0.1646770963198663, "learning_rate": 3.703698288431801e-06, "loss": 1.4424, "step": 1195 }, { "epoch": 0.6279434850863422, "grad_norm": 0.1603001093766745, "learning_rate": 3.659626006334395e-06, "loss": 1.4539, "step": 1200 }, { "epoch": 0.6305599162742019, "grad_norm": 0.15832268935743676, "learning_rate": 3.615665645079728e-06, "loss": 1.4581, "step": 1205 }, { "epoch": 0.6331763474620618, "grad_norm": 0.15675764863415026, "learning_rate": 3.5718208753453166e-06, "loss": 1.5017, "step": 1210 }, { "epoch": 0.6357927786499215, "grad_norm": 0.1564827555990543, "learning_rate": 3.5280953581568155e-06, "loss": 1.4583, "step": 1215 }, { "epoch": 0.6384092098377813, "grad_norm": 0.1564902878878436, "learning_rate": 3.484492744582325e-06, "loss": 1.4566, "step": 1220 }, { "epoch": 0.6410256410256411, "grad_norm": 0.15668534020828204, "learning_rate": 3.441016675427532e-06, "loss": 1.4551, "step": 1225 }, { "epoch": 0.6436420722135008, "grad_norm": 0.15413678756262972, "learning_rate": 3.397670780931699e-06, "loss": 1.4326, "step": 1230 }, { "epoch": 0.6462585034013606, "grad_norm": 0.15578687601295582, "learning_rate": 3.354458680464543e-06, "loss": 1.4317, "step": 1235 }, { "epoch": 0.6488749345892203, "grad_norm": 0.1638562831165353, "learning_rate": 3.311383982224017e-06, "loss": 1.4472, "step": 1240 }, { "epoch": 0.6514913657770801, "grad_norm": 0.16594544182894133, "learning_rate": 3.268450282935026e-06, "loss": 1.4729, "step": 1245 }, { "epoch": 0.6541077969649398, "grad_norm": 0.16046123770729828, "learning_rate": 3.2256611675491096e-06, "loss": 1.4583, "step": 1250 }, { "epoch": 0.6567242281527996, "grad_norm": 0.16513808379274897, "learning_rate": 3.183020208945086e-06, "loss": 1.4484, "step": 1255 }, { "epoch": 0.6593406593406593, "grad_norm": 0.16412228072472856, "learning_rate": 3.1405309676307283e-06, "loss": 1.4906, "step": 1260 }, { "epoch": 0.6619570905285191, "grad_norm": 0.16590678276919818, "learning_rate": 3.0981969914454555e-06, "loss": 1.4856, "step": 1265 }, { "epoch": 0.6645735217163788, "grad_norm": 0.1687948653068824, "learning_rate": 3.056021815264102e-06, "loss": 1.4641, "step": 1270 }, { "epoch": 0.6671899529042387, "grad_norm": 0.1627828541693421, "learning_rate": 3.0140089607017386e-06, "loss": 1.494, "step": 1275 }, { "epoch": 0.6698063840920984, "grad_norm": 0.15269049435378884, "learning_rate": 2.972161935819632e-06, "loss": 1.4347, "step": 1280 }, { "epoch": 0.6724228152799582, "grad_norm": 0.15762990269917085, "learning_rate": 2.930484234832315e-06, "loss": 1.4694, "step": 1285 }, { "epoch": 0.6750392464678179, "grad_norm": 0.15508871368297228, "learning_rate": 2.8889793378158284e-06, "loss": 1.4402, "step": 1290 }, { "epoch": 0.6776556776556777, "grad_norm": 0.17290914975265276, "learning_rate": 2.8476507104171273e-06, "loss": 1.4618, "step": 1295 }, { "epoch": 0.6802721088435374, "grad_norm": 0.15581225533545537, "learning_rate": 2.806501803564708e-06, "loss": 1.472, "step": 1300 }, { "epoch": 0.6828885400313972, "grad_norm": 0.16127501163040442, "learning_rate": 2.765536053180447e-06, "loss": 1.4563, "step": 1305 }, { "epoch": 0.6855049712192569, "grad_norm": 0.16320911225559154, "learning_rate": 2.724756879892717e-06, "loss": 1.4605, "step": 1310 }, { "epoch": 0.6881214024071167, "grad_norm": 0.16165957297000277, "learning_rate": 2.6841676887507505e-06, "loss": 1.4384, "step": 1315 }, { "epoch": 0.6907378335949764, "grad_norm": 0.15751845797115352, "learning_rate": 2.643771868940327e-06, "loss": 1.4584, "step": 1320 }, { "epoch": 0.6933542647828362, "grad_norm": 0.1540695191316521, "learning_rate": 2.603572793500775e-06, "loss": 1.4408, "step": 1325 }, { "epoch": 0.6959706959706959, "grad_norm": 0.1550335651788221, "learning_rate": 2.5635738190433252e-06, "loss": 1.4369, "step": 1330 }, { "epoch": 0.6985871271585558, "grad_norm": 0.16362988015639554, "learning_rate": 2.523778285470835e-06, "loss": 1.4822, "step": 1335 }, { "epoch": 0.7012035583464155, "grad_norm": 0.15606895225172598, "learning_rate": 2.4841895156989047e-06, "loss": 1.4672, "step": 1340 }, { "epoch": 0.7038199895342753, "grad_norm": 0.1615627045553084, "learning_rate": 2.444810815378416e-06, "loss": 1.4105, "step": 1345 }, { "epoch": 0.706436420722135, "grad_norm": 0.16228564404934667, "learning_rate": 2.4056454726195166e-06, "loss": 1.4706, "step": 1350 }, { "epoch": 0.7090528519099948, "grad_norm": 0.16011002705477617, "learning_rate": 2.366696757717054e-06, "loss": 1.4392, "step": 1355 }, { "epoch": 0.7116692830978545, "grad_norm": 0.15870675982792665, "learning_rate": 2.327967922877515e-06, "loss": 1.5238, "step": 1360 }, { "epoch": 0.7142857142857143, "grad_norm": 0.15138917306239938, "learning_rate": 2.28946220194746e-06, "loss": 1.4617, "step": 1365 }, { "epoch": 0.716902145473574, "grad_norm": 0.16468665101161267, "learning_rate": 2.2511828101435105e-06, "loss": 1.473, "step": 1370 }, { "epoch": 0.7195185766614338, "grad_norm": 0.15846712332838853, "learning_rate": 2.213132943783864e-06, "loss": 1.4154, "step": 1375 }, { "epoch": 0.7221350078492935, "grad_norm": 0.1586190639294726, "learning_rate": 2.1753157800214107e-06, "loss": 1.4482, "step": 1380 }, { "epoch": 0.7247514390371533, "grad_norm": 0.15881895479160194, "learning_rate": 2.137734476578443e-06, "loss": 1.4911, "step": 1385 }, { "epoch": 0.727367870225013, "grad_norm": 0.16286201033342562, "learning_rate": 2.1003921714829823e-06, "loss": 1.4584, "step": 1390 }, { "epoch": 0.7299843014128728, "grad_norm": 0.15788330405126516, "learning_rate": 2.063291982806759e-06, "loss": 1.4102, "step": 1395 }, { "epoch": 0.7326007326007326, "grad_norm": 0.15934473574485228, "learning_rate": 2.0264370084048498e-06, "loss": 1.4932, "step": 1400 }, { "epoch": 0.7352171637885924, "grad_norm": 0.16436224664001722, "learning_rate": 1.9898303256570093e-06, "loss": 1.4216, "step": 1405 }, { "epoch": 0.7378335949764521, "grad_norm": 0.15573035343733022, "learning_rate": 1.953474991210717e-06, "loss": 1.4545, "step": 1410 }, { "epoch": 0.7404500261643119, "grad_norm": 0.15560461400003067, "learning_rate": 1.917374040725935e-06, "loss": 1.4646, "step": 1415 }, { "epoch": 0.7430664573521716, "grad_norm": 0.1577474998263538, "learning_rate": 1.8815304886216385e-06, "loss": 1.4534, "step": 1420 }, { "epoch": 0.7456828885400314, "grad_norm": 0.15854274323074571, "learning_rate": 1.8459473278241125e-06, "loss": 1.417, "step": 1425 }, { "epoch": 0.7482993197278912, "grad_norm": 0.15488543431357468, "learning_rate": 1.8106275295170462e-06, "loss": 1.4453, "step": 1430 }, { "epoch": 0.7509157509157509, "grad_norm": 0.1486710066323424, "learning_rate": 1.7755740428934333e-06, "loss": 1.4146, "step": 1435 }, { "epoch": 0.7535321821036107, "grad_norm": 0.15860342482210135, "learning_rate": 1.7407897949093184e-06, "loss": 1.4131, "step": 1440 }, { "epoch": 0.7561486132914704, "grad_norm": 0.1498596914474104, "learning_rate": 1.7062776900393979e-06, "loss": 1.4928, "step": 1445 }, { "epoch": 0.7587650444793302, "grad_norm": 0.16559110824107307, "learning_rate": 1.6720406100344977e-06, "loss": 1.455, "step": 1450 }, { "epoch": 0.7613814756671899, "grad_norm": 0.15672560186933324, "learning_rate": 1.6380814136809442e-06, "loss": 1.4384, "step": 1455 }, { "epoch": 0.7639979068550498, "grad_norm": 0.15929938335939636, "learning_rate": 1.6044029365618612e-06, "loss": 1.4048, "step": 1460 }, { "epoch": 0.7666143380429095, "grad_norm": 0.15082068272349816, "learning_rate": 1.571007990820394e-06, "loss": 1.4777, "step": 1465 }, { "epoch": 0.7692307692307693, "grad_norm": 0.1551606972940531, "learning_rate": 1.5378993649249053e-06, "loss": 1.4509, "step": 1470 }, { "epoch": 0.771847200418629, "grad_norm": 0.16555549980007303, "learning_rate": 1.5050798234361269e-06, "loss": 1.4896, "step": 1475 }, { "epoch": 0.7744636316064888, "grad_norm": 0.15106932577324084, "learning_rate": 1.4725521067763298e-06, "loss": 1.4298, "step": 1480 }, { "epoch": 0.7770800627943485, "grad_norm": 0.16881824215449678, "learning_rate": 1.4403189310004917e-06, "loss": 1.4626, "step": 1485 }, { "epoch": 0.7796964939822083, "grad_norm": 0.15714915915750127, "learning_rate": 1.4083829875695172e-06, "loss": 1.4336, "step": 1490 }, { "epoch": 0.782312925170068, "grad_norm": 0.15042384246168655, "learning_rate": 1.376746943125491e-06, "loss": 1.4531, "step": 1495 }, { "epoch": 0.7849293563579278, "grad_norm": 0.16251411929370363, "learning_rate": 1.34541343926902e-06, "loss": 1.4554, "step": 1500 }, { "epoch": 0.7875457875457875, "grad_norm": 0.16135250106988436, "learning_rate": 1.3143850923386586e-06, "loss": 1.4649, "step": 1505 }, { "epoch": 0.7901622187336473, "grad_norm": 0.16618023255091524, "learning_rate": 1.2836644931924469e-06, "loss": 1.4879, "step": 1510 }, { "epoch": 0.792778649921507, "grad_norm": 0.15813395048358223, "learning_rate": 1.2532542069915722e-06, "loss": 1.4397, "step": 1515 }, { "epoch": 0.7953950811093669, "grad_norm": 0.15701143348776167, "learning_rate": 1.2231567729861809e-06, "loss": 1.4261, "step": 1520 }, { "epoch": 0.7980115122972266, "grad_norm": 0.1603189121004618, "learning_rate": 1.1933747043033505e-06, "loss": 1.4486, "step": 1525 }, { "epoch": 0.8006279434850864, "grad_norm": 0.1593742047063577, "learning_rate": 1.1639104877372475e-06, "loss": 1.4691, "step": 1530 }, { "epoch": 0.8032443746729461, "grad_norm": 0.16867515846268982, "learning_rate": 1.134766583541475e-06, "loss": 1.4516, "step": 1535 }, { "epoch": 0.8058608058608059, "grad_norm": 0.16003034328500954, "learning_rate": 1.1059454252236457e-06, "loss": 1.4664, "step": 1540 }, { "epoch": 0.8084772370486656, "grad_norm": 0.15168555205033307, "learning_rate": 1.0774494193421842e-06, "loss": 1.4613, "step": 1545 }, { "epoch": 0.8110936682365254, "grad_norm": 0.15563769398367197, "learning_rate": 1.0492809453053836e-06, "loss": 1.4464, "step": 1550 }, { "epoch": 0.8137100994243851, "grad_norm": 0.1685907836889909, "learning_rate": 1.0214423551727188e-06, "loss": 1.4501, "step": 1555 }, { "epoch": 0.8163265306122449, "grad_norm": 0.1577575034779687, "learning_rate": 9.939359734584552e-07, "loss": 1.4204, "step": 1560 }, { "epoch": 0.8189429618001046, "grad_norm": 0.15781696179307608, "learning_rate": 9.667640969375465e-07, "loss": 1.4233, "step": 1565 }, { "epoch": 0.8215593929879644, "grad_norm": 0.1631762144058873, "learning_rate": 9.399289944538664e-07, "loss": 1.4701, "step": 1570 }, { "epoch": 0.8241758241758241, "grad_norm": 0.158857072715717, "learning_rate": 9.134329067307485e-07, "loss": 1.4929, "step": 1575 }, { "epoch": 0.826792255363684, "grad_norm": 0.16115217693002612, "learning_rate": 8.872780461838931e-07, "loss": 1.4778, "step": 1580 }, { "epoch": 0.8294086865515437, "grad_norm": 0.1569960900935591, "learning_rate": 8.614665967366276e-07, "loss": 1.4912, "step": 1585 }, { "epoch": 0.8320251177394035, "grad_norm": 0.16399438070889508, "learning_rate": 8.360007136375553e-07, "loss": 1.4597, "step": 1590 }, { "epoch": 0.8346415489272632, "grad_norm": 0.1565164124394842, "learning_rate": 8.108825232805856e-07, "loss": 1.4573, "step": 1595 }, { "epoch": 0.837257980115123, "grad_norm": 0.15301717777573082, "learning_rate": 7.861141230273839e-07, "loss": 1.432, "step": 1600 }, { "epoch": 0.8398744113029827, "grad_norm": 0.16280701231358558, "learning_rate": 7.61697581032243e-07, "loss": 1.4701, "step": 1605 }, { "epoch": 0.8424908424908425, "grad_norm": 0.1542078888615764, "learning_rate": 7.376349360693952e-07, "loss": 1.4416, "step": 1610 }, { "epoch": 0.8451072736787022, "grad_norm": 0.15896265508191268, "learning_rate": 7.139281973627693e-07, "loss": 1.5054, "step": 1615 }, { "epoch": 0.847723704866562, "grad_norm": 0.15547362210975574, "learning_rate": 6.905793444182257e-07, "loss": 1.4047, "step": 1620 }, { "epoch": 0.8503401360544217, "grad_norm": 0.15640953304641333, "learning_rate": 6.675903268582623e-07, "loss": 1.4447, "step": 1625 }, { "epoch": 0.8529565672422815, "grad_norm": 0.1648337946012005, "learning_rate": 6.449630642592336e-07, "loss": 1.4324, "step": 1630 }, { "epoch": 0.8555729984301413, "grad_norm": 0.16020988675812617, "learning_rate": 6.22699445991054e-07, "loss": 1.5053, "step": 1635 }, { "epoch": 0.858189429618001, "grad_norm": 0.16301957953770885, "learning_rate": 6.008013310594418e-07, "loss": 1.4225, "step": 1640 }, { "epoch": 0.8608058608058609, "grad_norm": 0.15536283216420355, "learning_rate": 5.7927054795069e-07, "loss": 1.4632, "step": 1645 }, { "epoch": 0.8634222919937206, "grad_norm": 0.15402900867791064, "learning_rate": 5.581088944789953e-07, "loss": 1.4617, "step": 1650 }, { "epoch": 0.8660387231815804, "grad_norm": 0.15324358115634612, "learning_rate": 5.373181376363312e-07, "loss": 1.4933, "step": 1655 }, { "epoch": 0.8686551543694401, "grad_norm": 0.16150842206673804, "learning_rate": 5.169000134449115e-07, "loss": 1.4881, "step": 1660 }, { "epoch": 0.8712715855572999, "grad_norm": 0.1563492858079007, "learning_rate": 4.968562268122285e-07, "loss": 1.4782, "step": 1665 }, { "epoch": 0.8738880167451596, "grad_norm": 0.15656351967720294, "learning_rate": 4.771884513886998e-07, "loss": 1.4926, "step": 1670 }, { "epoch": 0.8765044479330194, "grad_norm": 0.15968896687575132, "learning_rate": 4.578983294279138e-07, "loss": 1.4465, "step": 1675 }, { "epoch": 0.8791208791208791, "grad_norm": 0.15840577744694784, "learning_rate": 4.389874716495013e-07, "loss": 1.471, "step": 1680 }, { "epoch": 0.8817373103087389, "grad_norm": 0.15537710474993283, "learning_rate": 4.204574571046438e-07, "loss": 1.4827, "step": 1685 }, { "epoch": 0.8843537414965986, "grad_norm": 0.1596282006005663, "learning_rate": 4.0230983304422543e-07, "loss": 1.4873, "step": 1690 }, { "epoch": 0.8869701726844584, "grad_norm": 0.1610797994580487, "learning_rate": 3.8454611478963235e-07, "loss": 1.4998, "step": 1695 }, { "epoch": 0.8895866038723181, "grad_norm": 0.16534664335538504, "learning_rate": 3.671677856062261e-07, "loss": 1.4371, "step": 1700 }, { "epoch": 0.892203035060178, "grad_norm": 0.1542092085511985, "learning_rate": 3.501762965794919e-07, "loss": 1.4787, "step": 1705 }, { "epoch": 0.8948194662480377, "grad_norm": 0.15126499803017035, "learning_rate": 3.335730664938758e-07, "loss": 1.4482, "step": 1710 }, { "epoch": 0.8974358974358975, "grad_norm": 0.1574221424399337, "learning_rate": 3.1735948171431e-07, "loss": 1.4415, "step": 1715 }, { "epoch": 0.9000523286237572, "grad_norm": 0.15973254318652566, "learning_rate": 3.015368960704584e-07, "loss": 1.4944, "step": 1720 }, { "epoch": 0.902668759811617, "grad_norm": 0.16225657403102964, "learning_rate": 2.8610663074366773e-07, "loss": 1.4666, "step": 1725 }, { "epoch": 0.9052851909994767, "grad_norm": 0.15648772554662277, "learning_rate": 2.7106997415665527e-07, "loss": 1.4265, "step": 1730 }, { "epoch": 0.9079016221873365, "grad_norm": 0.15343388907268007, "learning_rate": 2.564281818659159e-07, "loss": 1.4214, "step": 1735 }, { "epoch": 0.9105180533751962, "grad_norm": 0.15794655302835334, "learning_rate": 2.4218247645689306e-07, "loss": 1.5033, "step": 1740 }, { "epoch": 0.913134484563056, "grad_norm": 0.16018691162962145, "learning_rate": 2.2833404744188824e-07, "loss": 1.4402, "step": 1745 }, { "epoch": 0.9157509157509157, "grad_norm": 0.15874745650816893, "learning_rate": 2.1488405116074028e-07, "loss": 1.4202, "step": 1750 }, { "epoch": 0.9183673469387755, "grad_norm": 0.1681304342474622, "learning_rate": 2.0183361068426778e-07, "loss": 1.465, "step": 1755 }, { "epoch": 0.9209837781266352, "grad_norm": 0.15372824338493402, "learning_rate": 1.8918381572049393e-07, "loss": 1.4512, "step": 1760 }, { "epoch": 0.923600209314495, "grad_norm": 0.16183345451480904, "learning_rate": 1.7693572252365841e-07, "loss": 1.4484, "step": 1765 }, { "epoch": 0.9262166405023547, "grad_norm": 0.16140526836471605, "learning_rate": 1.650903538060189e-07, "loss": 1.4727, "step": 1770 }, { "epoch": 0.9288330716902146, "grad_norm": 0.15802040978302784, "learning_rate": 1.536486986524538e-07, "loss": 1.4756, "step": 1775 }, { "epoch": 0.9314495028780743, "grad_norm": 0.15845763106711214, "learning_rate": 1.426117124378762e-07, "loss": 1.4562, "step": 1780 }, { "epoch": 0.9340659340659341, "grad_norm": 0.15827675471278949, "learning_rate": 1.3198031674745814e-07, "loss": 1.3972, "step": 1785 }, { "epoch": 0.9366823652537938, "grad_norm": 0.15820634911568737, "learning_rate": 1.2175539929968117e-07, "loss": 1.447, "step": 1790 }, { "epoch": 0.9392987964416536, "grad_norm": 0.15746447537785097, "learning_rate": 1.1193781387220936e-07, "loss": 1.4424, "step": 1795 }, { "epoch": 0.9419152276295133, "grad_norm": 0.15952729687882758, "learning_rate": 1.0252838023059985e-07, "loss": 1.4973, "step": 1800 }, { "epoch": 0.9445316588173731, "grad_norm": 0.16828930147773663, "learning_rate": 9.352788405985469e-08, "loss": 1.4413, "step": 1805 }, { "epoch": 0.9471480900052328, "grad_norm": 0.15376650859627036, "learning_rate": 8.493707689881448e-08, "loss": 1.4496, "step": 1810 }, { "epoch": 0.9497645211930926, "grad_norm": 0.15522464643971864, "learning_rate": 7.675667607740356e-08, "loss": 1.4641, "step": 1815 }, { "epoch": 0.9523809523809523, "grad_norm": 0.16498724213623525, "learning_rate": 6.898736465673739e-08, "loss": 1.4701, "step": 1820 }, { "epoch": 0.9549973835688121, "grad_norm": 0.16595904586188515, "learning_rate": 6.162979137208314e-08, "loss": 1.4684, "step": 1825 }, { "epoch": 0.957613814756672, "grad_norm": 0.15857597277888638, "learning_rate": 5.468457057869358e-08, "loss": 1.441, "step": 1830 }, { "epoch": 0.9602302459445317, "grad_norm": 0.1606833298581918, "learning_rate": 4.815228220050538e-08, "loss": 1.4476, "step": 1835 }, { "epoch": 0.9628466771323915, "grad_norm": 0.1589246468457863, "learning_rate": 4.2033471681718895e-08, "loss": 1.4504, "step": 1840 }, { "epoch": 0.9654631083202512, "grad_norm": 0.15710438180801464, "learning_rate": 3.632864994125129e-08, "loss": 1.4322, "step": 1845 }, { "epoch": 0.968079539508111, "grad_norm": 0.1582315990940597, "learning_rate": 3.103829333007624e-08, "loss": 1.4641, "step": 1850 }, { "epoch": 0.9706959706959707, "grad_norm": 0.1566511194352204, "learning_rate": 2.616284359144794e-08, "loss": 1.4106, "step": 1855 }, { "epoch": 0.9733124018838305, "grad_norm": 0.1628350239191456, "learning_rate": 2.1702707824017287e-08, "loss": 1.465, "step": 1860 }, { "epoch": 0.9759288330716902, "grad_norm": 0.15467515622764103, "learning_rate": 1.7658258447836306e-08, "loss": 1.4735, "step": 1865 }, { "epoch": 0.97854526425955, "grad_norm": 0.16105669593925756, "learning_rate": 1.4029833173264673e-08, "loss": 1.4625, "step": 1870 }, { "epoch": 0.9811616954474097, "grad_norm": 0.16002164928848112, "learning_rate": 1.0817734972768946e-08, "loss": 1.4441, "step": 1875 }, { "epoch": 0.9837781266352695, "grad_norm": 0.16816554801748468, "learning_rate": 8.022232055623913e-09, "loss": 1.4317, "step": 1880 }, { "epoch": 0.9863945578231292, "grad_norm": 0.1600747418511804, "learning_rate": 5.643557845518843e-09, "loss": 1.4828, "step": 1885 }, { "epoch": 0.989010989010989, "grad_norm": 0.15616538247681158, "learning_rate": 3.6819109610658486e-09, "loss": 1.4604, "step": 1890 }, { "epoch": 0.9916274201988488, "grad_norm": 0.15776227258618464, "learning_rate": 2.137455199215377e-09, "loss": 1.3947, "step": 1895 }, { "epoch": 0.9942438513867086, "grad_norm": 0.15889575650183468, "learning_rate": 1.0103195215788175e-09, "loss": 1.4492, "step": 1900 }, { "epoch": 0.9968602825745683, "grad_norm": 0.15556430774291685, "learning_rate": 3.005980436604494e-10, "loss": 1.4935, "step": 1905 }, { "epoch": 0.9994767137624281, "grad_norm": 0.15905073281412974, "learning_rate": 8.350027000392224e-12, "loss": 1.4804, "step": 1910 }, { "epoch": 1.0, "eval_runtime": 2.2534, "eval_samples_per_second": 4.438, "eval_steps_per_second": 1.331, "step": 1911 }, { "epoch": 1.0, "step": 1911, "total_flos": 4845354067427328.0, "train_loss": 1.4736063955717222, "train_runtime": 1883.6956, "train_samples_per_second": 16.224, "train_steps_per_second": 1.014 } ], "logging_steps": 5, "max_steps": 1911, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4845354067427328.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }