{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 774, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007751937984496124, "grad_norm": 2.3754148483276367, "learning_rate": 7.692307692307693e-07, "loss": 4.068140983581543, "step": 2 }, { "epoch": 0.015503875968992248, "grad_norm": 0.20276732742786407, "learning_rate": 2.307692307692308e-06, "loss": 2.0239908695220947, "step": 4 }, { "epoch": 0.023255813953488372, "grad_norm": 0.35163113474845886, "learning_rate": 3.846153846153846e-06, "loss": 1.9337211847305298, "step": 6 }, { "epoch": 0.031007751937984496, "grad_norm": 0.19401516020298004, "learning_rate": 5.384615384615385e-06, "loss": 1.9213242530822754, "step": 8 }, { "epoch": 0.03875968992248062, "grad_norm": 0.11387048661708832, "learning_rate": 6.923076923076923e-06, "loss": 2.1911349296569824, "step": 10 }, { "epoch": 0.046511627906976744, "grad_norm": 2.326526165008545, "learning_rate": 8.461538461538462e-06, "loss": 3.268449306488037, "step": 12 }, { "epoch": 0.05426356589147287, "grad_norm": 0.15703660249710083, "learning_rate": 9.999999999999999e-06, "loss": 1.7003194093704224, "step": 14 }, { "epoch": 0.06201550387596899, "grad_norm": 0.6381284594535828, "learning_rate": 1.153846153846154e-05, "loss": 1.8064090013504028, "step": 16 }, { "epoch": 0.06976744186046512, "grad_norm": 0.5508278608322144, "learning_rate": 1.3076923076923078e-05, "loss": 1.3964051008224487, "step": 18 }, { "epoch": 0.07751937984496124, "grad_norm": 0.3715132772922516, "learning_rate": 1.4615384615384615e-05, "loss": 1.59793221950531, "step": 20 }, { "epoch": 0.08527131782945736, "grad_norm": 7.088710308074951, "learning_rate": 1.6153846153846154e-05, "loss": 1.7358227968215942, "step": 22 }, { "epoch": 0.09302325581395349, "grad_norm": 0.14647279679775238, "learning_rate": 1.7692307692307694e-05, "loss": 1.3447600603103638, "step": 24 }, { "epoch": 0.10077519379844961, "grad_norm": 0.22108672559261322, "learning_rate": 1.923076923076923e-05, "loss": 1.4682307243347168, "step": 26 }, { "epoch": 0.10852713178294573, "grad_norm": 0.3697395622730255, "learning_rate": 2.076923076923077e-05, "loss": 1.2035093307495117, "step": 28 }, { "epoch": 0.11627906976744186, "grad_norm": 0.24884682893753052, "learning_rate": 2.230769230769231e-05, "loss": 1.1427452564239502, "step": 30 }, { "epoch": 0.12403100775193798, "grad_norm": 0.18956558406352997, "learning_rate": 2.3846153846153846e-05, "loss": 1.3711202144622803, "step": 32 }, { "epoch": 0.13178294573643412, "grad_norm": 0.18877695500850677, "learning_rate": 2.5384615384615386e-05, "loss": 1.2189266681671143, "step": 34 }, { "epoch": 0.13953488372093023, "grad_norm": 0.10388179123401642, "learning_rate": 2.6923076923076923e-05, "loss": 1.3252586126327515, "step": 36 }, { "epoch": 0.14728682170542637, "grad_norm": 0.2508637309074402, "learning_rate": 2.846153846153846e-05, "loss": 1.0033904314041138, "step": 38 }, { "epoch": 0.15503875968992248, "grad_norm": 0.09986624866724014, "learning_rate": 3e-05, "loss": 1.4535468816757202, "step": 40 }, { "epoch": 0.16279069767441862, "grad_norm": 0.12885728478431702, "learning_rate": 2.999580739494117e-05, "loss": 1.0325186252593994, "step": 42 }, { "epoch": 0.17054263565891473, "grad_norm": 0.10903146117925644, "learning_rate": 2.998323233708815e-05, "loss": 1.2467223405838013, "step": 44 }, { "epoch": 0.17829457364341086, "grad_norm": 0.11212802678346634, "learning_rate": 2.9962283096597995e-05, "loss": 1.6686618328094482, "step": 46 }, { "epoch": 0.18604651162790697, "grad_norm": 0.2613708972930908, "learning_rate": 2.9932973451022333e-05, "loss": 0.8405603170394897, "step": 48 }, { "epoch": 0.1937984496124031, "grad_norm": 0.8741084337234497, "learning_rate": 2.9895322676246387e-05, "loss": 0.6372175812721252, "step": 50 }, { "epoch": 0.20155038759689922, "grad_norm": 0.20064491033554077, "learning_rate": 2.9849355533811937e-05, "loss": 1.0768086910247803, "step": 52 }, { "epoch": 0.20930232558139536, "grad_norm": 0.20809470117092133, "learning_rate": 2.9795102254632528e-05, "loss": 0.58198082447052, "step": 54 }, { "epoch": 0.21705426356589147, "grad_norm": 0.10565000772476196, "learning_rate": 2.9732598519111736e-05, "loss": 1.3517603874206543, "step": 56 }, { "epoch": 0.2248062015503876, "grad_norm": 0.10898104310035706, "learning_rate": 2.9661885433677437e-05, "loss": 1.340335488319397, "step": 58 }, { "epoch": 0.23255813953488372, "grad_norm": 0.3277449309825897, "learning_rate": 2.9583009503747627e-05, "loss": 1.1451056003570557, "step": 60 }, { "epoch": 0.24031007751937986, "grad_norm": 0.11206506192684174, "learning_rate": 2.9496022603145497e-05, "loss": 1.2255440950393677, "step": 62 }, { "epoch": 0.24806201550387597, "grad_norm": 0.14122240245342255, "learning_rate": 2.940098193998391e-05, "loss": 1.2778782844543457, "step": 64 }, { "epoch": 0.2558139534883721, "grad_norm": 0.17153455317020416, "learning_rate": 2.9297950019041724e-05, "loss": 1.178369402885437, "step": 66 }, { "epoch": 0.26356589147286824, "grad_norm": 0.2940099239349365, "learning_rate": 2.918699460065665e-05, "loss": 1.1788100004196167, "step": 68 }, { "epoch": 0.2713178294573643, "grad_norm": 0.07703827321529388, "learning_rate": 2.906818865616178e-05, "loss": 1.306922435760498, "step": 70 }, { "epoch": 0.27906976744186046, "grad_norm": 0.24490903317928314, "learning_rate": 2.8941610319894977e-05, "loss": 1.0475130081176758, "step": 72 }, { "epoch": 0.2868217054263566, "grad_norm": 0.13828147947788239, "learning_rate": 2.8807342837812783e-05, "loss": 1.1680102348327637, "step": 74 }, { "epoch": 0.29457364341085274, "grad_norm": 0.13997578620910645, "learning_rate": 2.8665474512742603e-05, "loss": 1.0921390056610107, "step": 76 }, { "epoch": 0.3023255813953488, "grad_norm": 0.08565083891153336, "learning_rate": 2.8516098646309108e-05, "loss": 1.1694703102111816, "step": 78 }, { "epoch": 0.31007751937984496, "grad_norm": 0.054738063365221024, "learning_rate": 2.8359313477573215e-05, "loss": 1.1712660789489746, "step": 80 }, { "epoch": 0.3178294573643411, "grad_norm": 0.06600243598222733, "learning_rate": 2.8195222118423792e-05, "loss": 1.32106351852417, "step": 82 }, { "epoch": 0.32558139534883723, "grad_norm": 0.1856859177350998, "learning_rate": 2.8023932485764768e-05, "loss": 1.002191424369812, "step": 84 }, { "epoch": 0.3333333333333333, "grad_norm": 0.1050916314125061, "learning_rate": 2.7845557230542076e-05, "loss": 1.1279501914978027, "step": 86 }, { "epoch": 0.34108527131782945, "grad_norm": 0.09759809821844101, "learning_rate": 2.7660213663657282e-05, "loss": 1.3432408571243286, "step": 88 }, { "epoch": 0.3488372093023256, "grad_norm": 0.09217233955860138, "learning_rate": 2.7468023678816447e-05, "loss": 0.8359699249267578, "step": 90 }, { "epoch": 0.35658914728682173, "grad_norm": 0.11377432197332382, "learning_rate": 2.726911367236509e-05, "loss": 1.1406829357147217, "step": 92 }, { "epoch": 0.3643410852713178, "grad_norm": 0.14340269565582275, "learning_rate": 2.706361446016193e-05, "loss": 1.142421841621399, "step": 94 }, { "epoch": 0.37209302325581395, "grad_norm": 0.39369332790374756, "learning_rate": 2.6851661191546038e-05, "loss": 1.2204563617706299, "step": 96 }, { "epoch": 0.3798449612403101, "grad_norm": 0.1904468685388565, "learning_rate": 2.6633393260454096e-05, "loss": 0.7862477898597717, "step": 98 }, { "epoch": 0.3875968992248062, "grad_norm": 0.13884544372558594, "learning_rate": 2.6408954213746028e-05, "loss": 0.7346755862236023, "step": 100 }, { "epoch": 0.3953488372093023, "grad_norm": 0.07463762909173965, "learning_rate": 2.61784916567995e-05, "loss": 1.3203626871109009, "step": 102 }, { "epoch": 0.40310077519379844, "grad_norm": 0.1436045616865158, "learning_rate": 2.5942157156435248e-05, "loss": 1.2376055717468262, "step": 104 }, { "epoch": 0.4108527131782946, "grad_norm": 0.14889220893383026, "learning_rate": 2.570010614123707e-05, "loss": 1.0368235111236572, "step": 106 }, { "epoch": 0.4186046511627907, "grad_norm": 0.073515884578228, "learning_rate": 2.545249779933216e-05, "loss": 1.105363130569458, "step": 108 }, { "epoch": 0.4263565891472868, "grad_norm": 0.0721718966960907, "learning_rate": 2.5199494973698856e-05, "loss": 1.0211938619613647, "step": 110 }, { "epoch": 0.43410852713178294, "grad_norm": 0.08736886829137802, "learning_rate": 2.494126405507074e-05, "loss": 0.9343675971031189, "step": 112 }, { "epoch": 0.4418604651162791, "grad_norm": 0.6035546064376831, "learning_rate": 2.4677974872507553e-05, "loss": 1.0941760540008545, "step": 114 }, { "epoch": 0.4496124031007752, "grad_norm": 0.10358745604753494, "learning_rate": 2.440980058170478e-05, "loss": 1.0486119985580444, "step": 116 }, { "epoch": 0.4573643410852713, "grad_norm": 0.41432875394821167, "learning_rate": 2.4136917551115484e-05, "loss": 0.9473840594291687, "step": 118 }, { "epoch": 0.46511627906976744, "grad_norm": 0.06210324168205261, "learning_rate": 2.38595052459592e-05, "loss": 1.2813960313796997, "step": 120 }, { "epoch": 0.4728682170542636, "grad_norm": 0.21510902047157288, "learning_rate": 2.357774611019419e-05, "loss": 1.0586227178573608, "step": 122 }, { "epoch": 0.4806201550387597, "grad_norm": 0.10468325763940811, "learning_rate": 2.3291825446530736e-05, "loss": 1.2756110429763794, "step": 124 }, { "epoch": 0.4883720930232558, "grad_norm": 1.1782441139221191, "learning_rate": 2.3001931294564265e-05, "loss": 1.168853759765625, "step": 126 }, { "epoch": 0.49612403100775193, "grad_norm": 0.11016172915697098, "learning_rate": 2.27082543071086e-05, "loss": 1.181935429573059, "step": 128 }, { "epoch": 0.5038759689922481, "grad_norm": 0.6895468831062317, "learning_rate": 2.2410987624810524e-05, "loss": 1.1901732683181763, "step": 130 }, { "epoch": 0.5116279069767442, "grad_norm": 0.18288742005825043, "learning_rate": 2.2110326749128233e-05, "loss": 0.7289036512374878, "step": 132 }, { "epoch": 0.5193798449612403, "grad_norm": 0.11772674322128296, "learning_rate": 2.1806469413757164e-05, "loss": 1.161149024963379, "step": 134 }, { "epoch": 0.5271317829457365, "grad_norm": 0.2412514090538025, "learning_rate": 2.149961545458773e-05, "loss": 1.1283718347549438, "step": 136 }, { "epoch": 0.5348837209302325, "grad_norm": 0.08974076807498932, "learning_rate": 2.118996667828058e-05, "loss": 1.362121343612671, "step": 138 }, { "epoch": 0.5426356589147286, "grad_norm": 0.12378139048814774, "learning_rate": 2.0877726729545665e-05, "loss": 1.2608673572540283, "step": 140 }, { "epoch": 0.5503875968992248, "grad_norm": 0.41343384981155396, "learning_rate": 2.0563100957212577e-05, "loss": 0.5950201153755188, "step": 142 }, { "epoch": 0.5581395348837209, "grad_norm": 0.09795749187469482, "learning_rate": 2.0246296279180093e-05, "loss": 1.3639545440673828, "step": 144 }, { "epoch": 0.5658914728682171, "grad_norm": 0.09332104027271271, "learning_rate": 1.9927521046333833e-05, "loss": 1.0145015716552734, "step": 146 }, { "epoch": 0.5736434108527132, "grad_norm": 0.15143655240535736, "learning_rate": 1.960698490552145e-05, "loss": 0.9937471151351929, "step": 148 }, { "epoch": 0.5813953488372093, "grad_norm": 0.13751712441444397, "learning_rate": 1.9284898661675586e-05, "loss": 1.0032529830932617, "step": 150 }, { "epoch": 0.5891472868217055, "grad_norm": 0.0935468077659607, "learning_rate": 1.8961474139175106e-05, "loss": 1.2299753427505493, "step": 152 }, { "epoch": 0.5968992248062015, "grad_norm": 0.06415323913097382, "learning_rate": 1.863692404253597e-05, "loss": 1.2138370275497437, "step": 154 }, { "epoch": 0.6046511627906976, "grad_norm": 0.17963920533657074, "learning_rate": 1.8311461816523192e-05, "loss": 0.7944934964179993, "step": 156 }, { "epoch": 0.6124031007751938, "grad_norm": 0.13642992079257965, "learning_rate": 1.7985301505776026e-05, "loss": 0.8701238036155701, "step": 158 }, { "epoch": 0.6201550387596899, "grad_norm": 0.09022583067417145, "learning_rate": 1.765865761403861e-05, "loss": 1.279708981513977, "step": 160 }, { "epoch": 0.627906976744186, "grad_norm": 0.310406357049942, "learning_rate": 1.733174496308864e-05, "loss": 1.020676612854004, "step": 162 }, { "epoch": 0.6356589147286822, "grad_norm": 0.0761994794011116, "learning_rate": 1.700477855145699e-05, "loss": 1.2313765287399292, "step": 164 }, { "epoch": 0.6434108527131783, "grad_norm": 0.09753235429525375, "learning_rate": 1.6677973413030936e-05, "loss": 0.9673617482185364, "step": 166 }, { "epoch": 0.6511627906976745, "grad_norm": 0.16590853035449982, "learning_rate": 1.6351544475634266e-05, "loss": 1.194890022277832, "step": 168 }, { "epoch": 0.6589147286821705, "grad_norm": 0.14948436617851257, "learning_rate": 1.6025706419677057e-05, "loss": 0.5818596482276917, "step": 170 }, { "epoch": 0.6666666666666666, "grad_norm": 0.1858793944120407, "learning_rate": 1.5700673536968222e-05, "loss": 1.1378095149993896, "step": 172 }, { "epoch": 0.6744186046511628, "grad_norm": 0.2438780963420868, "learning_rate": 1.5376659589783572e-05, "loss": 0.864031970500946, "step": 174 }, { "epoch": 0.6821705426356589, "grad_norm": 0.13808684051036835, "learning_rate": 1.5053877670282186e-05, "loss": 0.9113052487373352, "step": 176 }, { "epoch": 0.689922480620155, "grad_norm": 0.211846262216568, "learning_rate": 1.4732540060363447e-05, "loss": 0.9309589862823486, "step": 178 }, { "epoch": 0.6976744186046512, "grad_norm": 0.13679386675357819, "learning_rate": 1.4412858092056991e-05, "loss": 1.002301573753357, "step": 180 }, { "epoch": 0.7054263565891473, "grad_norm": 0.09943073987960815, "learning_rate": 1.4095042008537343e-05, "loss": 1.0712729692459106, "step": 182 }, { "epoch": 0.7131782945736435, "grad_norm": 0.18878647685050964, "learning_rate": 1.3779300825854622e-05, "loss": 0.9123468995094299, "step": 184 }, { "epoch": 0.7209302325581395, "grad_norm": 0.08719319850206375, "learning_rate": 1.3465842195472321e-05, "loss": 1.2733235359191895, "step": 186 }, { "epoch": 0.7286821705426356, "grad_norm": 0.06448693573474884, "learning_rate": 1.3154872267702522e-05, "loss": 0.9789453148841858, "step": 188 }, { "epoch": 0.7364341085271318, "grad_norm": 0.12241167575120926, "learning_rate": 1.2846595556128331e-05, "loss": 1.0140795707702637, "step": 190 }, { "epoch": 0.7441860465116279, "grad_norm": 0.14142774045467377, "learning_rate": 1.254121480310276e-05, "loss": 1.1332778930664062, "step": 192 }, { "epoch": 0.751937984496124, "grad_norm": 0.07337574660778046, "learning_rate": 1.2238930846412475e-05, "loss": 1.201830506324768, "step": 194 }, { "epoch": 0.7596899224806202, "grad_norm": 0.07899358868598938, "learning_rate": 1.1939942487194116e-05, "loss": 1.2011100053787231, "step": 196 }, { "epoch": 0.7674418604651163, "grad_norm": 0.10521137714385986, "learning_rate": 1.1644446359190004e-05, "loss": 0.5936653017997742, "step": 198 }, { "epoch": 0.7751937984496124, "grad_norm": 0.16837139427661896, "learning_rate": 1.1352636799429354e-05, "loss": 1.3216241598129272, "step": 200 }, { "epoch": 0.7829457364341085, "grad_norm": 0.11404802650213242, "learning_rate": 1.1064705720419829e-05, "loss": 1.084835171699524, "step": 202 }, { "epoch": 0.7906976744186046, "grad_norm": 0.24780981242656708, "learning_rate": 1.0780842483933755e-05, "loss": 1.2125266790390015, "step": 204 }, { "epoch": 0.7984496124031008, "grad_norm": 0.12619031965732574, "learning_rate": 1.050123377647171e-05, "loss": 1.0225963592529297, "step": 206 }, { "epoch": 0.8062015503875969, "grad_norm": 1.412670612335205, "learning_rate": 1.0226063486485695e-05, "loss": 0.7963980436325073, "step": 208 }, { "epoch": 0.813953488372093, "grad_norm": 0.18459799885749817, "learning_rate": 9.955512583442334e-06, "loss": 1.2788116931915283, "step": 210 }, { "epoch": 0.8217054263565892, "grad_norm": 0.058253731578588486, "learning_rate": 9.68975899880592e-06, "loss": 1.1842073202133179, "step": 212 }, { "epoch": 0.8294573643410853, "grad_norm": 0.09324084967374802, "learning_rate": 9.42897750901933e-06, "loss": 0.9420091509819031, "step": 214 }, { "epoch": 0.8372093023255814, "grad_norm": 0.14589789509773254, "learning_rate": 9.173339620559935e-06, "loss": 1.0436409711837769, "step": 216 }, { "epoch": 0.8449612403100775, "grad_norm": 0.08236993849277496, "learning_rate": 8.923013457146082e-06, "loss": 1.2834446430206299, "step": 218 }, { "epoch": 0.8527131782945736, "grad_norm": 0.07797209173440933, "learning_rate": 8.678163649168214e-06, "loss": 1.1693506240844727, "step": 220 }, { "epoch": 0.8604651162790697, "grad_norm": 0.21979407966136932, "learning_rate": 8.438951225417476e-06, "loss": 0.49415066838264465, "step": 222 }, { "epoch": 0.8682170542635659, "grad_norm": 0.16792796552181244, "learning_rate": 8.205533507182963e-06, "loss": 1.1654852628707886, "step": 224 }, { "epoch": 0.875968992248062, "grad_norm": 0.1074092760682106, "learning_rate": 7.978064004787238e-06, "loss": 1.2648242712020874, "step": 226 }, { "epoch": 0.8837209302325582, "grad_norm": 0.12686721980571747, "learning_rate": 7.756692316628162e-06, "loss": 0.8766679167747498, "step": 228 }, { "epoch": 0.8914728682170543, "grad_norm": 0.10413216799497604, "learning_rate": 7.541564030793536e-06, "loss": 0.9922328591346741, "step": 230 }, { "epoch": 0.8992248062015504, "grad_norm": 0.07999309152364731, "learning_rate": 7.33282062931308e-06, "loss": 0.837881863117218, "step": 232 }, { "epoch": 0.9069767441860465, "grad_norm": 0.16637900471687317, "learning_rate": 7.13059939511089e-06, "loss": 1.272527813911438, "step": 234 }, { "epoch": 0.9147286821705426, "grad_norm": 0.13920988142490387, "learning_rate": 6.935033321719421e-06, "loss": 0.6637862920761108, "step": 236 }, { "epoch": 0.9224806201550387, "grad_norm": 0.07921171188354492, "learning_rate": 6.746251025814548e-06, "loss": 1.2028839588165283, "step": 238 }, { "epoch": 0.9302325581395349, "grad_norm": 0.11715266853570938, "learning_rate": 6.564376662629032e-06, "loss": 1.0310890674591064, "step": 240 }, { "epoch": 0.937984496124031, "grad_norm": 0.1706083118915558, "learning_rate": 6.389529844300147e-06, "loss": 1.129476547241211, "step": 242 }, { "epoch": 0.9457364341085271, "grad_norm": 0.09015638381242752, "learning_rate": 6.2218255612051575e-06, "loss": 0.9788402915000916, "step": 244 }, { "epoch": 0.9534883720930233, "grad_norm": 0.09626635164022446, "learning_rate": 6.061374106336329e-06, "loss": 0.7472362518310547, "step": 246 }, { "epoch": 0.9612403100775194, "grad_norm": 0.17239803075790405, "learning_rate": 5.9082810027652495e-06, "loss": 0.7408154606819153, "step": 248 }, { "epoch": 0.9689922480620154, "grad_norm": 0.07973187416791916, "learning_rate": 5.762646934244157e-06, "loss": 1.1912089586257935, "step": 250 }, { "epoch": 0.9767441860465116, "grad_norm": 0.08109164237976074, "learning_rate": 5.6245676789899e-06, "loss": 0.970727264881134, "step": 252 }, { "epoch": 0.9844961240310077, "grad_norm": 0.2656784951686859, "learning_rate": 5.494134046694101e-06, "loss": 0.9474197626113892, "step": 254 }, { "epoch": 0.9922480620155039, "grad_norm": 0.09388367086648941, "learning_rate": 5.371431818800934e-06, "loss": 0.7675265073776245, "step": 256 }, { "epoch": 1.0, "grad_norm": 0.07208788394927979, "learning_rate": 5.256541692091799e-06, "loss": 1.151860237121582, "step": 258 }, { "epoch": 1.0077519379844961, "grad_norm": 0.07177931815385818, "learning_rate": 5.149539225613974e-06, "loss": 0.6956380605697632, "step": 260 }, { "epoch": 1.0155038759689923, "grad_norm": 0.06252402067184448, "learning_rate": 5.050494790988212e-06, "loss": 0.9135383367538452, "step": 262 }, { "epoch": 1.0232558139534884, "grad_norm": 0.17128507792949677, "learning_rate": 4.95947352612787e-06, "loss": 0.721315324306488, "step": 264 }, { "epoch": 1.0310077519379846, "grad_norm": 0.08467314392328262, "learning_rate": 4.876535292400089e-06, "loss": 0.4410458207130432, "step": 266 }, { "epoch": 1.0387596899224807, "grad_norm": 0.10766004770994186, "learning_rate": 4.801734635257148e-06, "loss": 0.8536827564239502, "step": 268 }, { "epoch": 1.0465116279069768, "grad_norm": 0.15451736748218536, "learning_rate": 4.735120748363916e-06, "loss": 0.903506875038147, "step": 270 }, { "epoch": 1.054263565891473, "grad_norm": 0.06612464040517807, "learning_rate": 4.676737441244975e-06, "loss": 0.48186248540878296, "step": 272 }, { "epoch": 1.062015503875969, "grad_norm": 0.10002221167087555, "learning_rate": 4.626623110472677e-06, "loss": 0.8960871696472168, "step": 274 }, { "epoch": 1.069767441860465, "grad_norm": 0.07858562469482422, "learning_rate": 4.584810714415135e-06, "loss": 0.8507243990898132, "step": 276 }, { "epoch": 1.0775193798449612, "grad_norm": 0.06665261089801788, "learning_rate": 4.5513277515607014e-06, "loss": 0.9197998642921448, "step": 278 }, { "epoch": 1.0852713178294573, "grad_norm": 0.16345328092575073, "learning_rate": 4.526196242433211e-06, "loss": 0.778313398361206, "step": 280 }, { "epoch": 1.0930232558139534, "grad_norm": 0.10489022731781006, "learning_rate": 4.509432715109887e-06, "loss": 0.5479567050933838, "step": 282 }, { "epoch": 1.1007751937984496, "grad_norm": 0.05080074071884155, "learning_rate": 4.50104819435143e-06, "loss": 0.6334800720214844, "step": 284 }, { "epoch": 1.1085271317829457, "grad_norm": 0.12185381352901459, "learning_rate": 4.50104819435143e-06, "loss": 0.8215212225914001, "step": 286 }, { "epoch": 1.1162790697674418, "grad_norm": 0.08784171938896179, "learning_rate": 4.509432715109887e-06, "loss": 0.5245926976203918, "step": 288 }, { "epoch": 1.124031007751938, "grad_norm": 0.09065528959035873, "learning_rate": 4.526196242433211e-06, "loss": 1.0330955982208252, "step": 290 }, { "epoch": 1.1317829457364341, "grad_norm": 0.04104357957839966, "learning_rate": 4.5513277515607014e-06, "loss": 0.5526050329208374, "step": 292 }, { "epoch": 1.1395348837209303, "grad_norm": 0.31155890226364136, "learning_rate": 4.584810714415136e-06, "loss": 1.046125888824463, "step": 294 }, { "epoch": 1.1472868217054264, "grad_norm": 0.15053009986877441, "learning_rate": 4.626623110472676e-06, "loss": 0.3840217590332031, "step": 296 }, { "epoch": 1.1550387596899225, "grad_norm": 0.08694499731063843, "learning_rate": 4.676737441244973e-06, "loss": 0.6799867153167725, "step": 298 }, { "epoch": 1.1627906976744187, "grad_norm": 0.07247356325387955, "learning_rate": 4.735120748363917e-06, "loss": 0.6748986840248108, "step": 300 }, { "epoch": 1.1705426356589148, "grad_norm": 0.06139397993683815, "learning_rate": 4.801734635257148e-06, "loss": 0.8421810865402222, "step": 302 }, { "epoch": 1.178294573643411, "grad_norm": 0.08629012107849121, "learning_rate": 4.876535292400087e-06, "loss": 0.5402819514274597, "step": 304 }, { "epoch": 1.1860465116279069, "grad_norm": 0.17801746726036072, "learning_rate": 4.95947352612787e-06, "loss": 0.9019787311553955, "step": 306 }, { "epoch": 1.193798449612403, "grad_norm": 0.13845831155776978, "learning_rate": 5.050494790988212e-06, "loss": 0.8330530524253845, "step": 308 }, { "epoch": 1.2015503875968991, "grad_norm": 0.0652163103222847, "learning_rate": 5.149539225613974e-06, "loss": 1.0060863494873047, "step": 310 }, { "epoch": 1.2093023255813953, "grad_norm": 0.0937967598438263, "learning_rate": 5.256541692091797e-06, "loss": 0.5403499007225037, "step": 312 }, { "epoch": 1.2170542635658914, "grad_norm": 0.19726139307022095, "learning_rate": 5.371431818800936e-06, "loss": 0.37406668066978455, "step": 314 }, { "epoch": 1.2248062015503876, "grad_norm": 0.11253905296325684, "learning_rate": 5.494134046694099e-06, "loss": 0.6960604786872864, "step": 316 }, { "epoch": 1.2325581395348837, "grad_norm": 0.08688368648290634, "learning_rate": 5.624567678989899e-06, "loss": 0.7832977771759033, "step": 318 }, { "epoch": 1.2403100775193798, "grad_norm": 0.19534340500831604, "learning_rate": 5.762646934244156e-06, "loss": 0.9501113295555115, "step": 320 }, { "epoch": 1.248062015503876, "grad_norm": 0.06447340548038483, "learning_rate": 5.908281002765248e-06, "loss": 1.0130536556243896, "step": 322 }, { "epoch": 1.255813953488372, "grad_norm": 0.11461887508630753, "learning_rate": 6.061374106336328e-06, "loss": 0.631900429725647, "step": 324 }, { "epoch": 1.2635658914728682, "grad_norm": 0.09350797533988953, "learning_rate": 6.2218255612051575e-06, "loss": 0.8754401803016663, "step": 326 }, { "epoch": 1.2713178294573644, "grad_norm": 0.11175557225942612, "learning_rate": 6.389529844300143e-06, "loss": 0.7127947807312012, "step": 328 }, { "epoch": 1.2790697674418605, "grad_norm": 0.09055038541555405, "learning_rate": 6.564376662629029e-06, "loss": 0.4656026363372803, "step": 330 }, { "epoch": 1.2868217054263567, "grad_norm": 0.09712733328342438, "learning_rate": 6.74625102581455e-06, "loss": 0.8079378008842468, "step": 332 }, { "epoch": 1.2945736434108528, "grad_norm": 0.18206307291984558, "learning_rate": 6.935033321719419e-06, "loss": 0.5637804865837097, "step": 334 }, { "epoch": 1.302325581395349, "grad_norm": 0.23368722200393677, "learning_rate": 7.130599395110884e-06, "loss": 0.8007771968841553, "step": 336 }, { "epoch": 1.310077519379845, "grad_norm": 0.05224426090717316, "learning_rate": 7.332820629313082e-06, "loss": 0.551106333732605, "step": 338 }, { "epoch": 1.3178294573643412, "grad_norm": 0.07984264940023422, "learning_rate": 7.541564030793533e-06, "loss": 0.7754759788513184, "step": 340 }, { "epoch": 1.3255813953488373, "grad_norm": 0.22976501286029816, "learning_rate": 7.75669231662816e-06, "loss": 0.7786872982978821, "step": 342 }, { "epoch": 1.3333333333333333, "grad_norm": 0.17023955285549164, "learning_rate": 7.978064004787231e-06, "loss": 0.7895460724830627, "step": 344 }, { "epoch": 1.3410852713178294, "grad_norm": 0.12108391523361206, "learning_rate": 8.205533507182961e-06, "loss": 0.20940443873405457, "step": 346 }, { "epoch": 1.3488372093023255, "grad_norm": 0.07635517418384552, "learning_rate": 8.438951225417474e-06, "loss": 0.819771409034729, "step": 348 }, { "epoch": 1.3565891472868217, "grad_norm": 0.11260077357292175, "learning_rate": 8.678163649168212e-06, "loss": 0.9801982641220093, "step": 350 }, { "epoch": 1.3643410852713178, "grad_norm": 0.09885291010141373, "learning_rate": 8.923013457146075e-06, "loss": 0.7718797326087952, "step": 352 }, { "epoch": 1.372093023255814, "grad_norm": 0.09329655021429062, "learning_rate": 9.173339620559931e-06, "loss": 0.40787971019744873, "step": 354 }, { "epoch": 1.37984496124031, "grad_norm": 0.11724522709846497, "learning_rate": 9.428977509019326e-06, "loss": 0.797160804271698, "step": 356 }, { "epoch": 1.3875968992248062, "grad_norm": 0.11735495924949646, "learning_rate": 9.689758998805924e-06, "loss": 0.6483190059661865, "step": 358 }, { "epoch": 1.3953488372093024, "grad_norm": 0.08914632350206375, "learning_rate": 9.955512583442333e-06, "loss": 0.7835768461227417, "step": 360 }, { "epoch": 1.4031007751937985, "grad_norm": 0.07666268944740295, "learning_rate": 1.0226063486485691e-05, "loss": 0.6386092901229858, "step": 362 }, { "epoch": 1.4108527131782946, "grad_norm": 0.08281254768371582, "learning_rate": 1.0501233776471714e-05, "loss": 0.8520874977111816, "step": 364 }, { "epoch": 1.4186046511627908, "grad_norm": 0.14842084050178528, "learning_rate": 1.0780842483933755e-05, "loss": 0.37374499440193176, "step": 366 }, { "epoch": 1.4263565891472867, "grad_norm": 0.24841120839118958, "learning_rate": 1.1064705720419827e-05, "loss": 0.3320968449115753, "step": 368 }, { "epoch": 1.4341085271317828, "grad_norm": 0.11581484228372574, "learning_rate": 1.135263679942935e-05, "loss": 0.7746375799179077, "step": 370 }, { "epoch": 1.441860465116279, "grad_norm": 0.0945417657494545, "learning_rate": 1.1644446359190006e-05, "loss": 0.6704602241516113, "step": 372 }, { "epoch": 1.449612403100775, "grad_norm": 0.06997057050466537, "learning_rate": 1.1939942487194116e-05, "loss": 0.9213350415229797, "step": 374 }, { "epoch": 1.4573643410852712, "grad_norm": 0.07435750216245651, "learning_rate": 1.2238930846412471e-05, "loss": 0.7233853936195374, "step": 376 }, { "epoch": 1.4651162790697674, "grad_norm": 0.18093754351139069, "learning_rate": 1.2541214803102757e-05, "loss": 0.5185383558273315, "step": 378 }, { "epoch": 1.4728682170542635, "grad_norm": 0.052637044340372086, "learning_rate": 1.2846595556128331e-05, "loss": 0.7751470804214478, "step": 380 }, { "epoch": 1.4806201550387597, "grad_norm": 0.10150747746229172, "learning_rate": 1.3154872267702518e-05, "loss": 0.7363438010215759, "step": 382 }, { "epoch": 1.4883720930232558, "grad_norm": 0.08896318078041077, "learning_rate": 1.3465842195472318e-05, "loss": 0.697909951210022, "step": 384 }, { "epoch": 1.496124031007752, "grad_norm": 0.09349460154771805, "learning_rate": 1.3779300825854622e-05, "loss": 0.5058455467224121, "step": 386 }, { "epoch": 1.503875968992248, "grad_norm": 0.0640454888343811, "learning_rate": 1.4095042008537336e-05, "loss": 0.6899944543838501, "step": 388 }, { "epoch": 1.5116279069767442, "grad_norm": 0.08342494815587997, "learning_rate": 1.4412858092056988e-05, "loss": 0.5844802856445312, "step": 390 }, { "epoch": 1.5193798449612403, "grad_norm": 0.14086060225963593, "learning_rate": 1.4732540060363447e-05, "loss": 0.6977730393409729, "step": 392 }, { "epoch": 1.5271317829457365, "grad_norm": 0.135100856423378, "learning_rate": 1.5053877670282176e-05, "loss": 0.7261441349983215, "step": 394 }, { "epoch": 1.5348837209302326, "grad_norm": 0.1089802235364914, "learning_rate": 1.537665958978357e-05, "loss": 0.7607800960540771, "step": 396 }, { "epoch": 1.5426356589147288, "grad_norm": 0.17686955630779266, "learning_rate": 1.5700673536968222e-05, "loss": 0.5964785218238831, "step": 398 }, { "epoch": 1.550387596899225, "grad_norm": 0.06133165583014488, "learning_rate": 1.6025706419677047e-05, "loss": 0.7581831812858582, "step": 400 }, { "epoch": 1.558139534883721, "grad_norm": 0.11867906898260117, "learning_rate": 1.6351544475634256e-05, "loss": 0.5359363555908203, "step": 402 }, { "epoch": 1.5658914728682172, "grad_norm": 0.08171830326318741, "learning_rate": 1.6677973413030932e-05, "loss": 0.9142735004425049, "step": 404 }, { "epoch": 1.5736434108527133, "grad_norm": 0.11325247585773468, "learning_rate": 1.7004778551456975e-05, "loss": 0.7637568712234497, "step": 406 }, { "epoch": 1.5813953488372094, "grad_norm": 0.054144054651260376, "learning_rate": 1.7331744963088644e-05, "loss": 0.31641456484794617, "step": 408 }, { "epoch": 1.5891472868217056, "grad_norm": 0.051439665257930756, "learning_rate": 1.7658657614038598e-05, "loss": 0.780099630355835, "step": 410 }, { "epoch": 1.5968992248062015, "grad_norm": 0.07669004052877426, "learning_rate": 1.7985301505776015e-05, "loss": 0.7998414635658264, "step": 412 }, { "epoch": 1.6046511627906976, "grad_norm": 0.08620447665452957, "learning_rate": 1.8311461816523192e-05, "loss": 0.5864279866218567, "step": 414 }, { "epoch": 1.6124031007751938, "grad_norm": 0.0925377830862999, "learning_rate": 1.8636924042535962e-05, "loss": 0.47105392813682556, "step": 416 }, { "epoch": 1.62015503875969, "grad_norm": 0.08717449009418488, "learning_rate": 1.8961474139175093e-05, "loss": 0.8024092316627502, "step": 418 }, { "epoch": 1.627906976744186, "grad_norm": 0.12033627182245255, "learning_rate": 1.9284898661675586e-05, "loss": 0.810451090335846, "step": 420 }, { "epoch": 1.6356589147286822, "grad_norm": 0.07522077113389969, "learning_rate": 1.9606984905521443e-05, "loss": 0.4688906967639923, "step": 422 }, { "epoch": 1.6434108527131783, "grad_norm": 0.07208564877510071, "learning_rate": 1.9927521046333837e-05, "loss": 0.7383279204368591, "step": 424 }, { "epoch": 1.6511627906976745, "grad_norm": 0.09510686248540878, "learning_rate": 2.0246296279180093e-05, "loss": 0.8395543694496155, "step": 426 }, { "epoch": 1.6589147286821704, "grad_norm": 0.15472382307052612, "learning_rate": 2.0563100957212567e-05, "loss": 0.8986775875091553, "step": 428 }, { "epoch": 1.6666666666666665, "grad_norm": 0.09020368754863739, "learning_rate": 2.0877726729545672e-05, "loss": 0.8169777393341064, "step": 430 }, { "epoch": 1.6744186046511627, "grad_norm": 0.198333740234375, "learning_rate": 2.1189966678280578e-05, "loss": 1.033119559288025, "step": 432 }, { "epoch": 1.6821705426356588, "grad_norm": 0.08684570342302322, "learning_rate": 2.149961545458772e-05, "loss": 0.5892492532730103, "step": 434 }, { "epoch": 1.689922480620155, "grad_norm": 0.0764966830611229, "learning_rate": 2.1806469413757164e-05, "loss": 0.7995302081108093, "step": 436 }, { "epoch": 1.697674418604651, "grad_norm": 0.13916683197021484, "learning_rate": 2.211032674912823e-05, "loss": 0.8415105938911438, "step": 438 }, { "epoch": 1.7054263565891472, "grad_norm": 0.24585378170013428, "learning_rate": 2.241098762481052e-05, "loss": 0.6350277066230774, "step": 440 }, { "epoch": 1.7131782945736433, "grad_norm": 0.050845544785261154, "learning_rate": 2.27082543071086e-05, "loss": 0.8463593125343323, "step": 442 }, { "epoch": 1.7209302325581395, "grad_norm": 0.07698489725589752, "learning_rate": 2.3001931294564265e-05, "loss": 0.5609403252601624, "step": 444 }, { "epoch": 1.7286821705426356, "grad_norm": 0.06638149172067642, "learning_rate": 2.3291825446530733e-05, "loss": 0.8690592050552368, "step": 446 }, { "epoch": 1.7364341085271318, "grad_norm": 0.08811336010694504, "learning_rate": 2.357774611019419e-05, "loss": 0.8064720630645752, "step": 448 }, { "epoch": 1.744186046511628, "grad_norm": 0.0755743682384491, "learning_rate": 2.385950524595919e-05, "loss": 1.0067108869552612, "step": 450 }, { "epoch": 1.751937984496124, "grad_norm": 0.06093823164701462, "learning_rate": 2.4136917551115478e-05, "loss": 0.967079222202301, "step": 452 }, { "epoch": 1.7596899224806202, "grad_norm": 0.09034255892038345, "learning_rate": 2.4409800581704777e-05, "loss": 0.6444424986839294, "step": 454 }, { "epoch": 1.7674418604651163, "grad_norm": 0.1733829230070114, "learning_rate": 2.4677974872507553e-05, "loss": 0.8322298526763916, "step": 456 }, { "epoch": 1.7751937984496124, "grad_norm": 0.23445071280002594, "learning_rate": 2.4941264055070734e-05, "loss": 0.4230212867259979, "step": 458 }, { "epoch": 1.7829457364341086, "grad_norm": 0.1249038353562355, "learning_rate": 2.5199494973698852e-05, "loss": 0.6065483093261719, "step": 460 }, { "epoch": 1.7906976744186047, "grad_norm": 0.08323405683040619, "learning_rate": 2.545249779933216e-05, "loss": 0.8183580040931702, "step": 462 }, { "epoch": 1.7984496124031009, "grad_norm": 0.10287293046712875, "learning_rate": 2.5700106141237063e-05, "loss": 0.9282822608947754, "step": 464 }, { "epoch": 1.806201550387597, "grad_norm": 0.053924717009067535, "learning_rate": 2.594215715643524e-05, "loss": 0.8734548687934875, "step": 466 }, { "epoch": 1.8139534883720931, "grad_norm": 0.10388979315757751, "learning_rate": 2.6178491656799497e-05, "loss": 0.8903089165687561, "step": 468 }, { "epoch": 1.8217054263565893, "grad_norm": 0.06755795329809189, "learning_rate": 2.640895421374602e-05, "loss": 0.4710087180137634, "step": 470 }, { "epoch": 1.8294573643410854, "grad_norm": 0.08703745901584625, "learning_rate": 2.6633393260454096e-05, "loss": 1.1290743350982666, "step": 472 }, { "epoch": 1.8372093023255816, "grad_norm": 0.10183677822351456, "learning_rate": 2.6851661191546034e-05, "loss": 0.6608400344848633, "step": 474 }, { "epoch": 1.8449612403100775, "grad_norm": 0.11454630643129349, "learning_rate": 2.706361446016192e-05, "loss": 0.850265383720398, "step": 476 }, { "epoch": 1.8527131782945736, "grad_norm": 0.06782646477222443, "learning_rate": 2.7269113672365096e-05, "loss": 0.6361703872680664, "step": 478 }, { "epoch": 1.8604651162790697, "grad_norm": 0.08559778332710266, "learning_rate": 2.7468023678816444e-05, "loss": 1.0639129877090454, "step": 480 }, { "epoch": 1.8682170542635659, "grad_norm": 0.06762553006410599, "learning_rate": 2.766021366365728e-05, "loss": 0.6422796845436096, "step": 482 }, { "epoch": 1.875968992248062, "grad_norm": 0.07438317686319351, "learning_rate": 2.784555723054208e-05, "loss": 0.7208263874053955, "step": 484 }, { "epoch": 1.8837209302325582, "grad_norm": 0.07318796217441559, "learning_rate": 2.8023932485764764e-05, "loss": 0.8420804738998413, "step": 486 }, { "epoch": 1.8914728682170543, "grad_norm": 0.10379486531019211, "learning_rate": 2.81952221184238e-05, "loss": 0.5533670783042908, "step": 488 }, { "epoch": 1.8992248062015504, "grad_norm": 1.0894800424575806, "learning_rate": 2.8359313477573215e-05, "loss": 0.688605785369873, "step": 490 }, { "epoch": 1.9069767441860463, "grad_norm": 0.23758739233016968, "learning_rate": 2.8516098646309108e-05, "loss": 0.5789573192596436, "step": 492 }, { "epoch": 1.9147286821705425, "grad_norm": 0.06857667863368988, "learning_rate": 2.8665474512742607e-05, "loss": 0.6448074579238892, "step": 494 }, { "epoch": 1.9224806201550386, "grad_norm": 0.08650626242160797, "learning_rate": 2.8807342837812783e-05, "loss": 0.6479641199111938, "step": 496 }, { "epoch": 1.9302325581395348, "grad_norm": 0.07275024801492691, "learning_rate": 2.894161031989497e-05, "loss": 0.4521400034427643, "step": 498 }, { "epoch": 1.937984496124031, "grad_norm": 0.05953352525830269, "learning_rate": 2.906818865616178e-05, "loss": 0.9132779240608215, "step": 500 }, { "epoch": 1.945736434108527, "grad_norm": 0.12861226499080658, "learning_rate": 2.9186994600656647e-05, "loss": 0.6908618807792664, "step": 502 }, { "epoch": 1.9534883720930232, "grad_norm": 0.07091208547353745, "learning_rate": 2.929795001904172e-05, "loss": 0.6676538586616516, "step": 504 }, { "epoch": 1.9612403100775193, "grad_norm": 0.11093394458293915, "learning_rate": 2.9400981939983914e-05, "loss": 1.0052788257598877, "step": 506 }, { "epoch": 1.9689922480620154, "grad_norm": 0.05772824585437775, "learning_rate": 2.9496022603145494e-05, "loss": 0.7913935780525208, "step": 508 }, { "epoch": 1.9767441860465116, "grad_norm": 0.0762370154261589, "learning_rate": 2.9583009503747627e-05, "loss": 0.9280475974082947, "step": 510 }, { "epoch": 1.9844961240310077, "grad_norm": 0.18662315607070923, "learning_rate": 2.9661885433677437e-05, "loss": 0.7493736743927002, "step": 512 }, { "epoch": 1.9922480620155039, "grad_norm": 0.07221183180809021, "learning_rate": 2.9732598519111736e-05, "loss": 1.0501880645751953, "step": 514 }, { "epoch": 2.0, "grad_norm": 0.08491652458906174, "learning_rate": 2.9795102254632528e-05, "loss": 1.011595368385315, "step": 516 }, { "epoch": 2.007751937984496, "grad_norm": 0.09373293071985245, "learning_rate": 2.9849355533811937e-05, "loss": 0.5705936551094055, "step": 518 }, { "epoch": 2.0155038759689923, "grad_norm": 0.06463813781738281, "learning_rate": 2.9895322676246387e-05, "loss": 0.7379302978515625, "step": 520 }, { "epoch": 2.0232558139534884, "grad_norm": 0.09566348791122437, "learning_rate": 2.993297345102233e-05, "loss": 0.46209296584129333, "step": 522 }, { "epoch": 2.0310077519379846, "grad_norm": 0.05616720765829086, "learning_rate": 2.9962283096597995e-05, "loss": 0.773676335811615, "step": 524 }, { "epoch": 2.0387596899224807, "grad_norm": 0.09464262425899506, "learning_rate": 2.998323233708815e-05, "loss": 0.6592158675193787, "step": 526 }, { "epoch": 2.046511627906977, "grad_norm": 0.09258489310741425, "learning_rate": 2.999580739494117e-05, "loss": 0.7777129411697388, "step": 528 }, { "epoch": 2.054263565891473, "grad_norm": 0.13635995984077454, "learning_rate": 3e-05, "loss": 0.385895311832428, "step": 530 }, { "epoch": 2.062015503875969, "grad_norm": 0.1054837629199028, "learning_rate": 2.999580739494117e-05, "loss": 0.7748541235923767, "step": 532 }, { "epoch": 2.0697674418604652, "grad_norm": 0.08860507607460022, "learning_rate": 2.998323233708815e-05, "loss": 0.407875120639801, "step": 534 }, { "epoch": 2.0775193798449614, "grad_norm": 0.07644882053136826, "learning_rate": 2.9962283096598e-05, "loss": 0.41405466198921204, "step": 536 }, { "epoch": 2.0852713178294575, "grad_norm": 0.20681916177272797, "learning_rate": 2.9932973451022333e-05, "loss": 0.701027512550354, "step": 538 }, { "epoch": 2.0930232558139537, "grad_norm": 0.07310563325881958, "learning_rate": 2.9895322676246387e-05, "loss": 0.4735100567340851, "step": 540 }, { "epoch": 2.10077519379845, "grad_norm": 0.0755162462592125, "learning_rate": 2.9849355533811937e-05, "loss": 0.27081194519996643, "step": 542 }, { "epoch": 2.108527131782946, "grad_norm": 0.07929737865924835, "learning_rate": 2.9795102254632528e-05, "loss": 0.6002092957496643, "step": 544 }, { "epoch": 2.116279069767442, "grad_norm": 0.25740522146224976, "learning_rate": 2.973259851911174e-05, "loss": 0.4636404514312744, "step": 546 }, { "epoch": 2.124031007751938, "grad_norm": 0.07688764482736588, "learning_rate": 2.9661885433677434e-05, "loss": 0.4923861026763916, "step": 548 }, { "epoch": 2.1317829457364343, "grad_norm": 0.24001885950565338, "learning_rate": 2.9583009503747627e-05, "loss": 0.3250856101512909, "step": 550 }, { "epoch": 2.13953488372093, "grad_norm": 0.09132993221282959, "learning_rate": 2.9496022603145497e-05, "loss": 0.7897784113883972, "step": 552 }, { "epoch": 2.147286821705426, "grad_norm": 0.09284122288227081, "learning_rate": 2.940098193998391e-05, "loss": 0.8441802859306335, "step": 554 }, { "epoch": 2.1550387596899223, "grad_norm": 0.07503140717744827, "learning_rate": 2.9297950019041724e-05, "loss": 0.4028940498828888, "step": 556 }, { "epoch": 2.1627906976744184, "grad_norm": 0.11651087552309036, "learning_rate": 2.9186994600656647e-05, "loss": 0.6657426953315735, "step": 558 }, { "epoch": 2.1705426356589146, "grad_norm": 0.06494183093309402, "learning_rate": 2.906818865616178e-05, "loss": 0.5439774990081787, "step": 560 }, { "epoch": 2.1782945736434107, "grad_norm": 0.05145857110619545, "learning_rate": 2.8941610319894977e-05, "loss": 0.7213448882102966, "step": 562 }, { "epoch": 2.186046511627907, "grad_norm": 0.1473415493965149, "learning_rate": 2.8807342837812783e-05, "loss": 0.38557326793670654, "step": 564 }, { "epoch": 2.193798449612403, "grad_norm": 0.2709689438343048, "learning_rate": 2.8665474512742603e-05, "loss": 0.41664543747901917, "step": 566 }, { "epoch": 2.201550387596899, "grad_norm": 0.06767801940441132, "learning_rate": 2.851609864630911e-05, "loss": 0.4579377770423889, "step": 568 }, { "epoch": 2.2093023255813953, "grad_norm": 0.3255118727684021, "learning_rate": 2.8359313477573215e-05, "loss": 0.3196179270744324, "step": 570 }, { "epoch": 2.2170542635658914, "grad_norm": 0.1096249520778656, "learning_rate": 2.8195222118423792e-05, "loss": 0.5369107127189636, "step": 572 }, { "epoch": 2.2248062015503876, "grad_norm": 0.2894248068332672, "learning_rate": 2.8023932485764768e-05, "loss": 0.23676389455795288, "step": 574 }, { "epoch": 2.2325581395348837, "grad_norm": 0.19947735965251923, "learning_rate": 2.7845557230542076e-05, "loss": 0.44901129603385925, "step": 576 }, { "epoch": 2.24031007751938, "grad_norm": 0.06506390869617462, "learning_rate": 2.766021366365729e-05, "loss": 0.5859266519546509, "step": 578 }, { "epoch": 2.248062015503876, "grad_norm": 0.10611079633235931, "learning_rate": 2.746802367881645e-05, "loss": 0.6005488038063049, "step": 580 }, { "epoch": 2.255813953488372, "grad_norm": 0.05949712544679642, "learning_rate": 2.726911367236509e-05, "loss": 0.32260704040527344, "step": 582 }, { "epoch": 2.2635658914728682, "grad_norm": 0.09240850806236267, "learning_rate": 2.706361446016193e-05, "loss": 0.8233704566955566, "step": 584 }, { "epoch": 2.2713178294573644, "grad_norm": 0.08874181658029556, "learning_rate": 2.685166119154604e-05, "loss": 0.4317566156387329, "step": 586 }, { "epoch": 2.2790697674418605, "grad_norm": 0.05373215302824974, "learning_rate": 2.6633393260454096e-05, "loss": 0.8105683326721191, "step": 588 }, { "epoch": 2.2868217054263567, "grad_norm": 0.05979755148291588, "learning_rate": 2.6408954213746025e-05, "loss": 0.4510256350040436, "step": 590 }, { "epoch": 2.294573643410853, "grad_norm": 0.056298933923244476, "learning_rate": 2.6178491656799504e-05, "loss": 0.7199202179908752, "step": 592 }, { "epoch": 2.302325581395349, "grad_norm": 0.06022209674119949, "learning_rate": 2.5942157156435248e-05, "loss": 0.47333112359046936, "step": 594 }, { "epoch": 2.310077519379845, "grad_norm": 0.1291632205247879, "learning_rate": 2.570010614123707e-05, "loss": 0.4947061836719513, "step": 596 }, { "epoch": 2.317829457364341, "grad_norm": 0.7499107718467712, "learning_rate": 2.5452497799332167e-05, "loss": 0.6046218872070312, "step": 598 }, { "epoch": 2.3255813953488373, "grad_norm": 0.05242902785539627, "learning_rate": 2.519949497369886e-05, "loss": 0.37087422609329224, "step": 600 }, { "epoch": 2.3333333333333335, "grad_norm": 0.4367184340953827, "learning_rate": 2.494126405507074e-05, "loss": 0.579389214515686, "step": 602 }, { "epoch": 2.3410852713178296, "grad_norm": 0.0486953966319561, "learning_rate": 2.467797487250756e-05, "loss": 0.7329738736152649, "step": 604 }, { "epoch": 2.3488372093023258, "grad_norm": 0.09891688823699951, "learning_rate": 2.4409800581704784e-05, "loss": 0.5676310658454895, "step": 606 }, { "epoch": 2.356589147286822, "grad_norm": 0.12641683220863342, "learning_rate": 2.4136917551115484e-05, "loss": 0.6383396983146667, "step": 608 }, { "epoch": 2.3643410852713176, "grad_norm": 0.07655075937509537, "learning_rate": 2.3859505245959206e-05, "loss": 0.6663593053817749, "step": 610 }, { "epoch": 2.3720930232558137, "grad_norm": 0.062206994742155075, "learning_rate": 2.3577746110194188e-05, "loss": 0.32523995637893677, "step": 612 }, { "epoch": 2.37984496124031, "grad_norm": 0.13163718581199646, "learning_rate": 2.329182544653074e-05, "loss": 0.5087898373603821, "step": 614 }, { "epoch": 2.387596899224806, "grad_norm": 0.04577813297510147, "learning_rate": 2.3001931294564278e-05, "loss": 0.5215730667114258, "step": 616 }, { "epoch": 2.395348837209302, "grad_norm": 0.06540275365114212, "learning_rate": 2.27082543071086e-05, "loss": 0.7069303393363953, "step": 618 }, { "epoch": 2.4031007751937983, "grad_norm": 0.04587893187999725, "learning_rate": 2.2410987624810527e-05, "loss": 0.6097102165222168, "step": 620 }, { "epoch": 2.4108527131782944, "grad_norm": 0.18531644344329834, "learning_rate": 2.2110326749128246e-05, "loss": 0.28449299931526184, "step": 622 }, { "epoch": 2.4186046511627906, "grad_norm": 0.06915592402219772, "learning_rate": 2.180646941375716e-05, "loss": 0.5394483208656311, "step": 624 }, { "epoch": 2.4263565891472867, "grad_norm": 0.0683450847864151, "learning_rate": 2.149961545458774e-05, "loss": 0.351560115814209, "step": 626 }, { "epoch": 2.434108527131783, "grad_norm": 0.0661771222949028, "learning_rate": 2.1189966678280585e-05, "loss": 0.6790451407432556, "step": 628 }, { "epoch": 2.441860465116279, "grad_norm": 0.2682180106639862, "learning_rate": 2.0877726729545665e-05, "loss": 0.34560778737068176, "step": 630 }, { "epoch": 2.449612403100775, "grad_norm": 0.05607810616493225, "learning_rate": 2.0563100957212584e-05, "loss": 0.35299909114837646, "step": 632 }, { "epoch": 2.4573643410852712, "grad_norm": 0.1276787519454956, "learning_rate": 2.02462962791801e-05, "loss": 0.45075708627700806, "step": 634 }, { "epoch": 2.4651162790697674, "grad_norm": 0.07231509685516357, "learning_rate": 1.9927521046333833e-05, "loss": 0.4892677664756775, "step": 636 }, { "epoch": 2.4728682170542635, "grad_norm": 0.12232723832130432, "learning_rate": 1.9606984905521463e-05, "loss": 0.6066938042640686, "step": 638 }, { "epoch": 2.4806201550387597, "grad_norm": 0.054693497717380524, "learning_rate": 1.928489866167559e-05, "loss": 0.3974202275276184, "step": 640 }, { "epoch": 2.488372093023256, "grad_norm": 0.07348073273897171, "learning_rate": 1.896147413917511e-05, "loss": 0.43941450119018555, "step": 642 }, { "epoch": 2.496124031007752, "grad_norm": 0.05807847902178764, "learning_rate": 1.863692404253597e-05, "loss": 0.5508748888969421, "step": 644 }, { "epoch": 2.503875968992248, "grad_norm": 0.08628101646900177, "learning_rate": 1.83114618165232e-05, "loss": 0.5954611897468567, "step": 646 }, { "epoch": 2.511627906976744, "grad_norm": 0.08698024600744247, "learning_rate": 1.798530150577603e-05, "loss": 0.7873520851135254, "step": 648 }, { "epoch": 2.5193798449612403, "grad_norm": 0.0802086666226387, "learning_rate": 1.765865761403861e-05, "loss": 0.27345526218414307, "step": 650 }, { "epoch": 2.5271317829457365, "grad_norm": 0.058408260345458984, "learning_rate": 1.7331744963088654e-05, "loss": 0.5833812355995178, "step": 652 }, { "epoch": 2.5348837209302326, "grad_norm": 0.10947899520397186, "learning_rate": 1.7004778551456995e-05, "loss": 0.3762988746166229, "step": 654 }, { "epoch": 2.5426356589147288, "grad_norm": 0.08571284264326096, "learning_rate": 1.667797341303094e-05, "loss": 0.5067244172096252, "step": 656 }, { "epoch": 2.550387596899225, "grad_norm": 0.06394554674625397, "learning_rate": 1.6351544475634277e-05, "loss": 0.42985814809799194, "step": 658 }, { "epoch": 2.558139534883721, "grad_norm": 0.1848040074110031, "learning_rate": 1.6025706419677054e-05, "loss": 0.8337141871452332, "step": 660 }, { "epoch": 2.565891472868217, "grad_norm": 0.04375322908163071, "learning_rate": 1.570067353696823e-05, "loss": 0.5003541707992554, "step": 662 }, { "epoch": 2.5736434108527133, "grad_norm": 0.04600893706083298, "learning_rate": 1.5376659589783585e-05, "loss": 0.3022569715976715, "step": 664 }, { "epoch": 2.5813953488372094, "grad_norm": 0.05343756452202797, "learning_rate": 1.5053877670282193e-05, "loss": 0.4718426465988159, "step": 666 }, { "epoch": 2.5891472868217056, "grad_norm": 0.09041419625282288, "learning_rate": 1.473254006036345e-05, "loss": 0.4901648163795471, "step": 668 }, { "epoch": 2.5968992248062017, "grad_norm": 0.06343325972557068, "learning_rate": 1.4412858092056995e-05, "loss": 0.6914687156677246, "step": 670 }, { "epoch": 2.604651162790698, "grad_norm": 0.06813778728246689, "learning_rate": 1.4095042008537343e-05, "loss": 0.4894769787788391, "step": 672 }, { "epoch": 2.612403100775194, "grad_norm": 0.1439589262008667, "learning_rate": 1.3779300825854615e-05, "loss": 0.7514118552207947, "step": 674 }, { "epoch": 2.62015503875969, "grad_norm": 0.07022061944007874, "learning_rate": 1.3465842195472315e-05, "loss": 0.7393191456794739, "step": 676 }, { "epoch": 2.6279069767441863, "grad_norm": 0.07147393375635147, "learning_rate": 1.3154872267702535e-05, "loss": 0.8212107419967651, "step": 678 }, { "epoch": 2.6356589147286824, "grad_norm": 0.06350687146186829, "learning_rate": 1.2846595556128338e-05, "loss": 0.7656596302986145, "step": 680 }, { "epoch": 2.6434108527131785, "grad_norm": 0.1861017942428589, "learning_rate": 1.2541214803102764e-05, "loss": 0.39778298139572144, "step": 682 }, { "epoch": 2.6511627906976747, "grad_norm": 0.12844400107860565, "learning_rate": 1.2238930846412478e-05, "loss": 0.4492897689342499, "step": 684 }, { "epoch": 2.6589147286821704, "grad_norm": 0.09717841446399689, "learning_rate": 1.1939942487194114e-05, "loss": 0.5477796792984009, "step": 686 }, { "epoch": 2.6666666666666665, "grad_norm": 0.0593947097659111, "learning_rate": 1.1644446359190002e-05, "loss": 0.28585392236709595, "step": 688 }, { "epoch": 2.6744186046511627, "grad_norm": 0.045567888766527176, "learning_rate": 1.1352636799429364e-05, "loss": 0.5053625106811523, "step": 690 }, { "epoch": 2.682170542635659, "grad_norm": 0.05959075689315796, "learning_rate": 1.1064705720419824e-05, "loss": 0.567241370677948, "step": 692 }, { "epoch": 2.689922480620155, "grad_norm": 0.15132786333560944, "learning_rate": 1.0780842483933762e-05, "loss": 0.6684018969535828, "step": 694 }, { "epoch": 2.697674418604651, "grad_norm": 0.08239150047302246, "learning_rate": 1.0501233776471719e-05, "loss": 0.33106908202171326, "step": 696 }, { "epoch": 2.705426356589147, "grad_norm": 0.06124155595898628, "learning_rate": 1.0226063486485696e-05, "loss": 0.566682755947113, "step": 698 }, { "epoch": 2.7131782945736433, "grad_norm": 0.07035624980926514, "learning_rate": 9.955512583442338e-06, "loss": 0.4341398775577545, "step": 700 }, { "epoch": 2.7209302325581395, "grad_norm": 0.05901051685214043, "learning_rate": 9.689758998805937e-06, "loss": 0.4164765775203705, "step": 702 }, { "epoch": 2.7286821705426356, "grad_norm": 0.04497726634144783, "learning_rate": 9.428977509019321e-06, "loss": 0.40749120712280273, "step": 704 }, { "epoch": 2.7364341085271318, "grad_norm": 0.15438151359558105, "learning_rate": 9.173339620559945e-06, "loss": 0.28900110721588135, "step": 706 }, { "epoch": 2.744186046511628, "grad_norm": 0.05592001974582672, "learning_rate": 8.923013457146072e-06, "loss": 0.41211241483688354, "step": 708 }, { "epoch": 2.751937984496124, "grad_norm": 0.0629056990146637, "learning_rate": 8.678163649168217e-06, "loss": 0.5537896156311035, "step": 710 }, { "epoch": 2.75968992248062, "grad_norm": 0.06699282675981522, "learning_rate": 8.43895122541748e-06, "loss": 0.5788278579711914, "step": 712 }, { "epoch": 2.7674418604651163, "grad_norm": 0.13577990233898163, "learning_rate": 8.205533507182964e-06, "loss": 0.37125617265701294, "step": 714 }, { "epoch": 2.7751937984496124, "grad_norm": 0.16210103034973145, "learning_rate": 7.978064004787233e-06, "loss": 0.3962320387363434, "step": 716 }, { "epoch": 2.7829457364341086, "grad_norm": 0.06700747460126877, "learning_rate": 7.756692316628171e-06, "loss": 0.6869024634361267, "step": 718 }, { "epoch": 2.7906976744186047, "grad_norm": 0.06708226352930069, "learning_rate": 7.541564030793529e-06, "loss": 0.5122371912002563, "step": 720 }, { "epoch": 2.798449612403101, "grad_norm": 0.0619942843914032, "learning_rate": 7.332820629313089e-06, "loss": 0.4030957818031311, "step": 722 }, { "epoch": 2.806201550387597, "grad_norm": 0.08853983879089355, "learning_rate": 7.1305993951108914e-06, "loss": 0.4579683840274811, "step": 724 }, { "epoch": 2.813953488372093, "grad_norm": 0.1136261448264122, "learning_rate": 6.935033321719423e-06, "loss": 0.4582154452800751, "step": 726 }, { "epoch": 2.8217054263565893, "grad_norm": 0.03374806419014931, "learning_rate": 6.74625102581455e-06, "loss": 0.46171411871910095, "step": 728 }, { "epoch": 2.8294573643410854, "grad_norm": 0.05085311084985733, "learning_rate": 6.56437666262903e-06, "loss": 0.4829785227775574, "step": 730 }, { "epoch": 2.8372093023255816, "grad_norm": 0.051897477358579636, "learning_rate": 6.389529844300143e-06, "loss": 0.4446869194507599, "step": 732 }, { "epoch": 2.8449612403100772, "grad_norm": 0.06380399316549301, "learning_rate": 6.221825561205165e-06, "loss": 0.5170708298683167, "step": 734 }, { "epoch": 2.8527131782945734, "grad_norm": 0.07282527536153793, "learning_rate": 6.061374106336333e-06, "loss": 0.6230844259262085, "step": 736 }, { "epoch": 2.8604651162790695, "grad_norm": 0.09063038229942322, "learning_rate": 5.908281002765252e-06, "loss": 0.35932058095932007, "step": 738 }, { "epoch": 2.8682170542635657, "grad_norm": 1.006274938583374, "learning_rate": 5.762646934244159e-06, "loss": 0.3806362748146057, "step": 740 }, { "epoch": 2.875968992248062, "grad_norm": 0.3232397139072418, "learning_rate": 5.624567678989899e-06, "loss": 0.513190507888794, "step": 742 }, { "epoch": 2.883720930232558, "grad_norm": 0.120949886739254, "learning_rate": 5.494134046694099e-06, "loss": 0.6526894569396973, "step": 744 }, { "epoch": 2.891472868217054, "grad_norm": 0.12644588947296143, "learning_rate": 5.371431818800933e-06, "loss": 0.4791458249092102, "step": 746 }, { "epoch": 2.89922480620155, "grad_norm": 0.06687454879283905, "learning_rate": 5.256541692091802e-06, "loss": 0.5770004987716675, "step": 748 }, { "epoch": 2.9069767441860463, "grad_norm": 0.08843245357275009, "learning_rate": 5.149539225613978e-06, "loss": 0.3434167802333832, "step": 750 }, { "epoch": 2.9147286821705425, "grad_norm": 0.07200266420841217, "learning_rate": 5.050494790988215e-06, "loss": 0.4575299322605133, "step": 752 }, { "epoch": 2.9224806201550386, "grad_norm": 0.07060275971889496, "learning_rate": 4.959473526127871e-06, "loss": 0.3564453721046448, "step": 754 }, { "epoch": 2.9302325581395348, "grad_norm": 0.06662537902593613, "learning_rate": 4.876535292400089e-06, "loss": 0.7428521513938904, "step": 756 }, { "epoch": 2.937984496124031, "grad_norm": 0.05963343381881714, "learning_rate": 4.801734635257146e-06, "loss": 0.4571719169616699, "step": 758 }, { "epoch": 2.945736434108527, "grad_norm": 0.05507909134030342, "learning_rate": 4.73512074836392e-06, "loss": 0.5132399797439575, "step": 760 }, { "epoch": 2.953488372093023, "grad_norm": 0.05289539694786072, "learning_rate": 4.676737441244973e-06, "loss": 0.814540445804596, "step": 762 }, { "epoch": 2.9612403100775193, "grad_norm": 0.05587043985724449, "learning_rate": 4.626623110472678e-06, "loss": 0.5996021628379822, "step": 764 }, { "epoch": 2.9689922480620154, "grad_norm": 0.0820513367652893, "learning_rate": 4.584810714415136e-06, "loss": 0.2337801605463028, "step": 766 }, { "epoch": 2.9767441860465116, "grad_norm": 0.08467745780944824, "learning_rate": 4.551327751560703e-06, "loss": 0.43569573760032654, "step": 768 }, { "epoch": 2.9844961240310077, "grad_norm": 0.09394794702529907, "learning_rate": 4.526196242433211e-06, "loss": 0.42782190442085266, "step": 770 }, { "epoch": 2.992248062015504, "grad_norm": 0.05439142882823944, "learning_rate": 4.509432715109889e-06, "loss": 0.516304612159729, "step": 772 }, { "epoch": 3.0, "grad_norm": 0.03667069226503372, "learning_rate": 4.50104819435143e-06, "loss": 0.15898612141609192, "step": 774 }, { "epoch": 3.0, "step": 774, "total_flos": 3.2487544184132076e+18, "train_loss": 0.8073726282178277, "train_runtime": 15483.8831, "train_samples_per_second": 3.199, "train_steps_per_second": 0.05 } ], "logging_steps": 2, "max_steps": 774, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.2487544184132076e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }