{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989708404802744, "eval_steps": 500, "global_step": 728, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 9.090909090909091e-07, "loss": 1.4022, "step": 3 }, { "epoch": 0.01, "learning_rate": 1.8181818181818183e-06, "loss": 1.4239, "step": 6 }, { "epoch": 0.01, "learning_rate": 2.7272727272727272e-06, "loss": 1.3843, "step": 9 }, { "epoch": 0.02, "learning_rate": 3.6363636363636366e-06, "loss": 1.3722, "step": 12 }, { "epoch": 0.02, "learning_rate": 4.5454545454545455e-06, "loss": 1.3411, "step": 15 }, { "epoch": 0.02, "learning_rate": 5.4545454545454545e-06, "loss": 1.3187, "step": 18 }, { "epoch": 0.03, "learning_rate": 6.363636363636364e-06, "loss": 1.284, "step": 21 }, { "epoch": 0.03, "learning_rate": 7.272727272727273e-06, "loss": 1.2492, "step": 24 }, { "epoch": 0.04, "learning_rate": 8.181818181818183e-06, "loss": 1.2658, "step": 27 }, { "epoch": 0.04, "learning_rate": 9.090909090909091e-06, "loss": 1.2173, "step": 30 }, { "epoch": 0.05, "learning_rate": 1e-05, "loss": 1.2302, "step": 33 }, { "epoch": 0.05, "learning_rate": 1.0909090909090909e-05, "loss": 1.2301, "step": 36 }, { "epoch": 0.05, "learning_rate": 1.181818181818182e-05, "loss": 1.1855, "step": 39 }, { "epoch": 0.06, "learning_rate": 1.2727272727272728e-05, "loss": 1.2094, "step": 42 }, { "epoch": 0.06, "learning_rate": 1.3636363636363637e-05, "loss": 1.1788, "step": 45 }, { "epoch": 0.07, "learning_rate": 1.4545454545454546e-05, "loss": 1.1804, "step": 48 }, { "epoch": 0.07, "learning_rate": 1.5454545454545454e-05, "loss": 1.166, "step": 51 }, { "epoch": 0.07, "learning_rate": 1.6363636363636366e-05, "loss": 1.1256, "step": 54 }, { "epoch": 0.08, "learning_rate": 1.7272727272727274e-05, "loss": 1.1289, "step": 57 }, { "epoch": 0.08, "learning_rate": 1.8181818181818182e-05, "loss": 1.1392, "step": 60 }, { "epoch": 0.09, "learning_rate": 1.9090909090909094e-05, "loss": 1.131, "step": 63 }, { "epoch": 0.09, "learning_rate": 2e-05, "loss": 1.1288, "step": 66 }, { "epoch": 0.09, "learning_rate": 1.9999900994429424e-05, "loss": 1.1198, "step": 69 }, { "epoch": 0.1, "learning_rate": 1.999960397967811e-05, "loss": 1.1281, "step": 72 }, { "epoch": 0.1, "learning_rate": 1.9999108961627284e-05, "loss": 1.134, "step": 75 }, { "epoch": 0.11, "learning_rate": 1.9998415950078858e-05, "loss": 1.1148, "step": 78 }, { "epoch": 0.11, "learning_rate": 1.9997524958755226e-05, "loss": 1.1162, "step": 81 }, { "epoch": 0.12, "learning_rate": 1.9996436005299013e-05, "loss": 1.12, "step": 84 }, { "epoch": 0.12, "learning_rate": 1.999514911127271e-05, "loss": 1.12, "step": 87 }, { "epoch": 0.12, "learning_rate": 1.9993664302158255e-05, "loss": 1.0938, "step": 90 }, { "epoch": 0.13, "learning_rate": 1.9991981607356517e-05, "loss": 1.0838, "step": 93 }, { "epoch": 0.13, "learning_rate": 1.9990101060186732e-05, "loss": 1.1078, "step": 96 }, { "epoch": 0.14, "learning_rate": 1.998802269788583e-05, "loss": 1.1037, "step": 99 }, { "epoch": 0.14, "learning_rate": 1.9985746561607696e-05, "loss": 1.0804, "step": 102 }, { "epoch": 0.14, "learning_rate": 1.998327269642237e-05, "loss": 1.0977, "step": 105 }, { "epoch": 0.15, "learning_rate": 1.998060115131513e-05, "loss": 1.1036, "step": 108 }, { "epoch": 0.15, "learning_rate": 1.9977731979185556e-05, "loss": 1.1109, "step": 111 }, { "epoch": 0.16, "learning_rate": 1.9974665236846443e-05, "loss": 1.0937, "step": 114 }, { "epoch": 0.16, "learning_rate": 1.9971400985022712e-05, "loss": 1.0834, "step": 117 }, { "epoch": 0.16, "learning_rate": 1.9967939288350184e-05, "loss": 1.1002, "step": 120 }, { "epoch": 0.17, "learning_rate": 1.9964280215374312e-05, "loss": 1.0847, "step": 123 }, { "epoch": 0.17, "learning_rate": 1.9960423838548814e-05, "loss": 1.0845, "step": 126 }, { "epoch": 0.18, "learning_rate": 1.995637023423425e-05, "loss": 1.0984, "step": 129 }, { "epoch": 0.18, "learning_rate": 1.9952119482696504e-05, "loss": 1.0836, "step": 132 }, { "epoch": 0.19, "learning_rate": 1.9947671668105185e-05, "loss": 1.082, "step": 135 }, { "epoch": 0.19, "learning_rate": 1.9943026878531985e-05, "loss": 1.0707, "step": 138 }, { "epoch": 0.19, "learning_rate": 1.9938185205948906e-05, "loss": 1.0545, "step": 141 }, { "epoch": 0.2, "learning_rate": 1.993314674622646e-05, "loss": 1.0618, "step": 144 }, { "epoch": 0.2, "learning_rate": 1.992791159913177e-05, "loss": 1.0514, "step": 147 }, { "epoch": 0.21, "learning_rate": 1.992247986832658e-05, "loss": 1.0733, "step": 150 }, { "epoch": 0.21, "learning_rate": 1.99168516613652e-05, "loss": 1.0712, "step": 153 }, { "epoch": 0.21, "learning_rate": 1.991102708969241e-05, "loss": 1.0788, "step": 156 }, { "epoch": 0.22, "learning_rate": 1.9905006268641212e-05, "loss": 1.0744, "step": 159 }, { "epoch": 0.22, "learning_rate": 1.9898789317430577e-05, "loss": 1.0621, "step": 162 }, { "epoch": 0.23, "learning_rate": 1.9892376359163058e-05, "loss": 1.0598, "step": 165 }, { "epoch": 0.23, "learning_rate": 1.9885767520822377e-05, "loss": 1.095, "step": 168 }, { "epoch": 0.23, "learning_rate": 1.9878962933270896e-05, "loss": 1.0666, "step": 171 }, { "epoch": 0.24, "learning_rate": 1.987196273124703e-05, "loss": 1.0657, "step": 174 }, { "epoch": 0.24, "learning_rate": 1.986476705336258e-05, "loss": 1.0691, "step": 177 }, { "epoch": 0.25, "learning_rate": 1.9857376042099982e-05, "loss": 1.0663, "step": 180 }, { "epoch": 0.25, "learning_rate": 1.9849789843809496e-05, "loss": 1.0476, "step": 183 }, { "epoch": 0.26, "learning_rate": 1.9842008608706295e-05, "loss": 1.0509, "step": 186 }, { "epoch": 0.26, "learning_rate": 1.983403249086751e-05, "loss": 1.0622, "step": 189 }, { "epoch": 0.26, "learning_rate": 1.9825861648229154e-05, "loss": 1.0708, "step": 192 }, { "epoch": 0.27, "learning_rate": 1.981749624258302e-05, "loss": 1.0672, "step": 195 }, { "epoch": 0.27, "learning_rate": 1.9808936439573455e-05, "loss": 1.0627, "step": 198 }, { "epoch": 0.28, "learning_rate": 1.9800182408694096e-05, "loss": 1.0726, "step": 201 }, { "epoch": 0.28, "learning_rate": 1.9791234323284515e-05, "loss": 1.0558, "step": 204 }, { "epoch": 0.28, "learning_rate": 1.9782092360526763e-05, "loss": 1.0677, "step": 207 }, { "epoch": 0.29, "learning_rate": 1.977275670144189e-05, "loss": 1.0422, "step": 210 }, { "epoch": 0.29, "learning_rate": 1.9763227530886348e-05, "loss": 1.0364, "step": 213 }, { "epoch": 0.3, "learning_rate": 1.9753505037548334e-05, "loss": 1.0475, "step": 216 }, { "epoch": 0.3, "learning_rate": 1.974358941394404e-05, "loss": 1.0508, "step": 219 }, { "epoch": 0.3, "learning_rate": 1.973348085641387e-05, "loss": 1.0595, "step": 222 }, { "epoch": 0.31, "learning_rate": 1.972317956511852e-05, "loss": 1.0528, "step": 225 }, { "epoch": 0.31, "learning_rate": 1.971268574403503e-05, "loss": 1.0562, "step": 228 }, { "epoch": 0.32, "learning_rate": 1.970199960095276e-05, "loss": 1.0329, "step": 231 }, { "epoch": 0.32, "learning_rate": 1.9691121347469235e-05, "loss": 1.045, "step": 234 }, { "epoch": 0.33, "learning_rate": 1.9680051198986004e-05, "loss": 1.0561, "step": 237 }, { "epoch": 0.33, "learning_rate": 1.9668789374704337e-05, "loss": 1.0449, "step": 240 }, { "epoch": 0.33, "learning_rate": 1.9657336097620904e-05, "loss": 1.0359, "step": 243 }, { "epoch": 0.34, "learning_rate": 1.964569159452335e-05, "loss": 1.0359, "step": 246 }, { "epoch": 0.34, "learning_rate": 1.963385609598581e-05, "loss": 1.0271, "step": 249 }, { "epoch": 0.35, "learning_rate": 1.9621829836364335e-05, "loss": 1.0563, "step": 252 }, { "epoch": 0.35, "learning_rate": 1.9609613053792276e-05, "loss": 1.0416, "step": 255 }, { "epoch": 0.35, "learning_rate": 1.9597205990175528e-05, "loss": 1.0578, "step": 258 }, { "epoch": 0.36, "learning_rate": 1.958460889118778e-05, "loss": 1.0461, "step": 261 }, { "epoch": 0.36, "learning_rate": 1.9571822006265623e-05, "loss": 1.0262, "step": 264 }, { "epoch": 0.37, "learning_rate": 1.9558845588603625e-05, "loss": 1.0254, "step": 267 }, { "epoch": 0.37, "learning_rate": 1.9545679895149315e-05, "loss": 1.0642, "step": 270 }, { "epoch": 0.37, "learning_rate": 1.9532325186598093e-05, "loss": 1.0456, "step": 273 }, { "epoch": 0.38, "learning_rate": 1.951878172738806e-05, "loss": 1.0358, "step": 276 }, { "epoch": 0.38, "learning_rate": 1.9505049785694803e-05, "loss": 1.0409, "step": 279 }, { "epoch": 0.39, "learning_rate": 1.9491129633426068e-05, "loss": 1.0382, "step": 282 }, { "epoch": 0.39, "learning_rate": 1.9477021546216376e-05, "loss": 1.0415, "step": 285 }, { "epoch": 0.4, "learning_rate": 1.9462725803421566e-05, "loss": 1.0308, "step": 288 }, { "epoch": 0.4, "learning_rate": 1.9448242688113286e-05, "loss": 1.0376, "step": 291 }, { "epoch": 0.4, "learning_rate": 1.9433572487073343e-05, "loss": 1.0259, "step": 294 }, { "epoch": 0.41, "learning_rate": 1.9418715490788066e-05, "loss": 1.0496, "step": 297 }, { "epoch": 0.41, "learning_rate": 1.9403671993442534e-05, "loss": 1.0519, "step": 300 }, { "epoch": 0.42, "learning_rate": 1.9388442292914754e-05, "loss": 1.0418, "step": 303 }, { "epoch": 0.42, "learning_rate": 1.937302669076976e-05, "loss": 1.0372, "step": 306 }, { "epoch": 0.42, "learning_rate": 1.9357425492253662e-05, "loss": 1.0347, "step": 309 }, { "epoch": 0.43, "learning_rate": 1.934163900628756e-05, "loss": 1.0253, "step": 312 }, { "epoch": 0.43, "learning_rate": 1.9325667545461466e-05, "loss": 1.0401, "step": 315 }, { "epoch": 0.44, "learning_rate": 1.9309511426028105e-05, "loss": 1.0282, "step": 318 }, { "epoch": 0.44, "learning_rate": 1.9293170967896632e-05, "loss": 1.0306, "step": 321 }, { "epoch": 0.44, "learning_rate": 1.9276646494626333e-05, "loss": 1.0313, "step": 324 }, { "epoch": 0.45, "learning_rate": 1.9259938333420183e-05, "loss": 1.0433, "step": 327 }, { "epoch": 0.45, "learning_rate": 1.9243046815118387e-05, "loss": 1.0232, "step": 330 }, { "epoch": 0.46, "learning_rate": 1.922597227419183e-05, "loss": 1.0222, "step": 333 }, { "epoch": 0.46, "learning_rate": 1.9208715048735446e-05, "loss": 1.0186, "step": 336 }, { "epoch": 0.47, "learning_rate": 1.9191275480461525e-05, "loss": 1.033, "step": 339 }, { "epoch": 0.47, "learning_rate": 1.9173653914692947e-05, "loss": 1.0342, "step": 342 }, { "epoch": 0.47, "learning_rate": 1.9155850700356345e-05, "loss": 1.035, "step": 345 }, { "epoch": 0.48, "learning_rate": 1.91378661899752e-05, "loss": 1.0206, "step": 348 }, { "epoch": 0.48, "learning_rate": 1.9119700739662857e-05, "loss": 1.0435, "step": 351 }, { "epoch": 0.49, "learning_rate": 1.910135470911547e-05, "loss": 1.0181, "step": 354 }, { "epoch": 0.49, "learning_rate": 1.908282846160488e-05, "loss": 1.0267, "step": 357 }, { "epoch": 0.49, "learning_rate": 1.9064122363971426e-05, "loss": 1.0365, "step": 360 }, { "epoch": 0.5, "learning_rate": 1.904523678661669e-05, "loss": 1.0381, "step": 363 }, { "epoch": 0.5, "learning_rate": 1.9026172103496138e-05, "loss": 1.0048, "step": 366 }, { "epoch": 0.51, "learning_rate": 1.900692869211174e-05, "loss": 1.0392, "step": 369 }, { "epoch": 0.51, "learning_rate": 1.898750693350447e-05, "loss": 1.0278, "step": 372 }, { "epoch": 0.51, "learning_rate": 1.8967907212246803e-05, "loss": 1.013, "step": 375 }, { "epoch": 0.52, "learning_rate": 1.8948129916435048e-05, "loss": 1.0385, "step": 378 }, { "epoch": 0.52, "learning_rate": 1.8928175437681698e-05, "loss": 1.0168, "step": 381 }, { "epoch": 0.53, "learning_rate": 1.8908044171107658e-05, "loss": 1.0123, "step": 384 }, { "epoch": 0.53, "learning_rate": 1.8887736515334443e-05, "loss": 1.015, "step": 387 }, { "epoch": 0.54, "learning_rate": 1.8867252872476255e-05, "loss": 1.0265, "step": 390 }, { "epoch": 0.54, "learning_rate": 1.884659364813205e-05, "loss": 0.9997, "step": 393 }, { "epoch": 0.54, "learning_rate": 1.8825759251377484e-05, "loss": 1.0109, "step": 396 }, { "epoch": 0.55, "learning_rate": 1.8804750094756827e-05, "loss": 1.0199, "step": 399 }, { "epoch": 0.55, "learning_rate": 1.8783566594274783e-05, "loss": 0.9998, "step": 402 }, { "epoch": 0.56, "learning_rate": 1.8762209169388262e-05, "loss": 1.0088, "step": 405 }, { "epoch": 0.56, "learning_rate": 1.8740678242998077e-05, "loss": 1.0022, "step": 408 }, { "epoch": 0.56, "learning_rate": 1.8718974241440552e-05, "loss": 1.0216, "step": 411 }, { "epoch": 0.57, "learning_rate": 1.8697097594479103e-05, "loss": 1.0248, "step": 414 }, { "epoch": 0.57, "learning_rate": 1.867504873529571e-05, "loss": 0.9974, "step": 417 }, { "epoch": 0.58, "learning_rate": 1.865282810048235e-05, "loss": 1.0138, "step": 420 }, { "epoch": 0.58, "learning_rate": 1.8630436130032353e-05, "loss": 1.0004, "step": 423 }, { "epoch": 0.58, "learning_rate": 1.860787326733168e-05, "loss": 1.0081, "step": 426 }, { "epoch": 0.59, "learning_rate": 1.8585139959150144e-05, "loss": 1.0238, "step": 429 }, { "epoch": 0.59, "learning_rate": 1.856223665563258e-05, "loss": 1.0328, "step": 432 }, { "epoch": 0.6, "learning_rate": 1.8539163810289914e-05, "loss": 1.0071, "step": 435 }, { "epoch": 0.6, "learning_rate": 1.8515921879990187e-05, "loss": 1.0134, "step": 438 }, { "epoch": 0.61, "learning_rate": 1.8492511324949516e-05, "loss": 1.0181, "step": 441 }, { "epoch": 0.61, "learning_rate": 1.8468932608722975e-05, "loss": 1.0363, "step": 444 }, { "epoch": 0.61, "learning_rate": 1.8445186198195406e-05, "loss": 1.0011, "step": 447 }, { "epoch": 0.62, "learning_rate": 1.8421272563572202e-05, "loss": 0.9993, "step": 450 }, { "epoch": 0.62, "learning_rate": 1.8397192178369965e-05, "loss": 1.0201, "step": 453 }, { "epoch": 0.63, "learning_rate": 1.837294551940716e-05, "loss": 0.987, "step": 456 }, { "epoch": 0.63, "learning_rate": 1.834853306679464e-05, "loss": 1.0106, "step": 459 }, { "epoch": 0.63, "learning_rate": 1.8323955303926165e-05, "loss": 1.0034, "step": 462 }, { "epoch": 0.64, "learning_rate": 1.8299212717468825e-05, "loss": 1.0095, "step": 465 }, { "epoch": 0.64, "learning_rate": 1.8274305797353397e-05, "loss": 0.9921, "step": 468 }, { "epoch": 0.65, "learning_rate": 1.824923503676465e-05, "loss": 0.9859, "step": 471 }, { "epoch": 0.65, "learning_rate": 1.822400093213157e-05, "loss": 1.017, "step": 474 }, { "epoch": 0.65, "learning_rate": 1.8198603983117546e-05, "loss": 1.0118, "step": 477 }, { "epoch": 0.66, "learning_rate": 1.8173044692610466e-05, "loss": 0.9912, "step": 480 }, { "epoch": 0.66, "learning_rate": 1.8147323566712755e-05, "loss": 1.0162, "step": 483 }, { "epoch": 0.67, "learning_rate": 1.8121441114731366e-05, "loss": 1.0089, "step": 486 }, { "epoch": 0.67, "learning_rate": 1.809539784916768e-05, "loss": 0.9752, "step": 489 }, { "epoch": 0.68, "learning_rate": 1.806919428570737e-05, "loss": 1.007, "step": 492 }, { "epoch": 0.68, "learning_rate": 1.804283094321019e-05, "loss": 1.0145, "step": 495 }, { "epoch": 0.68, "learning_rate": 1.8016308343699686e-05, "loss": 1.0008, "step": 498 }, { "epoch": 0.69, "learning_rate": 1.798962701235289e-05, "loss": 1.0067, "step": 501 }, { "epoch": 0.69, "learning_rate": 1.796278747748988e-05, "loss": 1.0017, "step": 504 }, { "epoch": 0.7, "learning_rate": 1.7935790270563345e-05, "loss": 1.0086, "step": 507 }, { "epoch": 0.7, "learning_rate": 1.790863592614807e-05, "loss": 0.9884, "step": 510 }, { "epoch": 0.7, "learning_rate": 1.788132498193032e-05, "loss": 1.0028, "step": 513 }, { "epoch": 0.71, "learning_rate": 1.7853857978697223e-05, "loss": 1.0055, "step": 516 }, { "epoch": 0.71, "learning_rate": 1.7826235460326043e-05, "loss": 1.005, "step": 519 }, { "epoch": 0.72, "learning_rate": 1.7798457973773418e-05, "loss": 1.002, "step": 522 }, { "epoch": 0.72, "learning_rate": 1.7770526069064525e-05, "loss": 0.9838, "step": 525 }, { "epoch": 0.72, "learning_rate": 1.7742440299282203e-05, "loss": 1.001, "step": 528 }, { "epoch": 0.73, "learning_rate": 1.7714201220555982e-05, "loss": 0.9984, "step": 531 }, { "epoch": 0.73, "learning_rate": 1.7685809392051084e-05, "loss": 1.0035, "step": 534 }, { "epoch": 0.74, "learning_rate": 1.765726537595734e-05, "loss": 1.0076, "step": 537 }, { "epoch": 0.74, "learning_rate": 1.7628569737478076e-05, "loss": 0.9936, "step": 540 }, { "epoch": 0.75, "learning_rate": 1.7599723044818898e-05, "loss": 1.0053, "step": 543 }, { "epoch": 0.75, "learning_rate": 1.7570725869176468e-05, "loss": 0.9968, "step": 546 }, { "epoch": 0.75, "learning_rate": 1.7541578784727163e-05, "loss": 1.0059, "step": 549 }, { "epoch": 0.76, "learning_rate": 1.751228236861573e-05, "loss": 1.0059, "step": 552 }, { "epoch": 0.76, "learning_rate": 1.7482837200943845e-05, "loss": 1.0081, "step": 555 }, { "epoch": 0.77, "learning_rate": 1.7453243864758638e-05, "loss": 1.0215, "step": 558 }, { "epoch": 0.77, "learning_rate": 1.7423502946041133e-05, "loss": 0.9935, "step": 561 }, { "epoch": 0.77, "learning_rate": 1.739361503369466e-05, "loss": 0.9945, "step": 564 }, { "epoch": 0.78, "learning_rate": 1.7363580719533173e-05, "loss": 0.9926, "step": 567 }, { "epoch": 0.78, "learning_rate": 1.733340059826956e-05, "loss": 0.9946, "step": 570 }, { "epoch": 0.79, "learning_rate": 1.7303075267503845e-05, "loss": 1.0079, "step": 573 }, { "epoch": 0.79, "learning_rate": 1.7272605327711364e-05, "loss": 1.0212, "step": 576 }, { "epoch": 0.79, "learning_rate": 1.7241991382230872e-05, "loss": 0.993, "step": 579 }, { "epoch": 0.8, "learning_rate": 1.72112340372526e-05, "loss": 0.9843, "step": 582 }, { "epoch": 0.8, "learning_rate": 1.718033390180624e-05, "loss": 0.9837, "step": 585 }, { "epoch": 0.81, "learning_rate": 1.71492915877489e-05, "loss": 0.959, "step": 588 }, { "epoch": 0.81, "learning_rate": 1.7118107709752986e-05, "loss": 0.9895, "step": 591 }, { "epoch": 0.82, "learning_rate": 1.7086782885294026e-05, "loss": 0.99, "step": 594 }, { "epoch": 0.82, "learning_rate": 1.7055317734638444e-05, "loss": 1.006, "step": 597 }, { "epoch": 0.82, "learning_rate": 1.702371288083127e-05, "loss": 1.0009, "step": 600 }, { "epoch": 0.83, "learning_rate": 1.6991968949683835e-05, "loss": 0.9758, "step": 603 }, { "epoch": 0.83, "learning_rate": 1.6960086569761332e-05, "loss": 0.9801, "step": 606 }, { "epoch": 0.84, "learning_rate": 1.6928066372370407e-05, "loss": 0.9833, "step": 609 }, { "epoch": 0.84, "learning_rate": 1.689590899154664e-05, "loss": 0.9846, "step": 612 }, { "epoch": 0.84, "learning_rate": 1.6863615064042003e-05, "loss": 0.9752, "step": 615 }, { "epoch": 0.85, "learning_rate": 1.6831185229312237e-05, "loss": 0.9869, "step": 618 }, { "epoch": 0.85, "learning_rate": 1.67986201295042e-05, "loss": 0.9869, "step": 621 }, { "epoch": 0.86, "learning_rate": 1.676592040944315e-05, "loss": 0.9878, "step": 624 }, { "epoch": 0.86, "learning_rate": 1.6733086716619976e-05, "loss": 0.9938, "step": 627 }, { "epoch": 0.86, "learning_rate": 1.6700119701178378e-05, "loss": 1.0045, "step": 630 }, { "epoch": 0.87, "learning_rate": 1.666702001590199e-05, "loss": 1.0088, "step": 633 }, { "epoch": 0.87, "learning_rate": 1.6633788316201455e-05, "loss": 0.998, "step": 636 }, { "epoch": 0.88, "learning_rate": 1.6600425260101453e-05, "loss": 1.0017, "step": 639 }, { "epoch": 0.88, "learning_rate": 1.6566931508227663e-05, "loss": 0.9995, "step": 642 }, { "epoch": 0.89, "learning_rate": 1.6533307723793688e-05, "loss": 1.0012, "step": 645 }, { "epoch": 0.89, "learning_rate": 1.649955457258792e-05, "loss": 0.9807, "step": 648 }, { "epoch": 0.89, "learning_rate": 1.6465672722960365e-05, "loss": 0.9664, "step": 651 }, { "epoch": 0.9, "learning_rate": 1.6431662845809388e-05, "loss": 0.9707, "step": 654 }, { "epoch": 0.9, "learning_rate": 1.6397525614568446e-05, "loss": 0.983, "step": 657 }, { "epoch": 0.91, "learning_rate": 1.6363261705192757e-05, "loss": 1.0061, "step": 660 }, { "epoch": 0.91, "learning_rate": 1.6328871796145894e-05, "loss": 0.9899, "step": 663 }, { "epoch": 0.91, "learning_rate": 1.629435656838637e-05, "loss": 0.9795, "step": 666 }, { "epoch": 0.92, "learning_rate": 1.6259716705354154e-05, "loss": 1.0002, "step": 669 }, { "epoch": 0.92, "learning_rate": 1.6224952892957122e-05, "loss": 0.9837, "step": 672 }, { "epoch": 0.93, "learning_rate": 1.6190065819557496e-05, "loss": 0.9872, "step": 675 }, { "epoch": 0.93, "learning_rate": 1.615505617595819e-05, "loss": 0.9797, "step": 678 }, { "epoch": 0.93, "learning_rate": 1.6119924655389158e-05, "loss": 0.9926, "step": 681 }, { "epoch": 0.94, "learning_rate": 1.6084671953493645e-05, "loss": 0.9884, "step": 684 }, { "epoch": 0.94, "learning_rate": 1.6049298768314425e-05, "loss": 0.9918, "step": 687 }, { "epoch": 0.95, "learning_rate": 1.6013805800279977e-05, "loss": 0.9829, "step": 690 }, { "epoch": 0.95, "learning_rate": 1.5978193752190607e-05, "loss": 0.9854, "step": 693 }, { "epoch": 0.96, "learning_rate": 1.5942463329204546e-05, "loss": 0.9751, "step": 696 }, { "epoch": 0.96, "learning_rate": 1.5906615238823974e-05, "loss": 0.9945, "step": 699 }, { "epoch": 0.96, "learning_rate": 1.5870650190881023e-05, "loss": 0.9957, "step": 702 }, { "epoch": 0.97, "learning_rate": 1.583456889752371e-05, "loss": 1.0047, "step": 705 }, { "epoch": 0.97, "learning_rate": 1.579837207320184e-05, "loss": 0.9921, "step": 708 }, { "epoch": 0.98, "learning_rate": 1.5762060434652863e-05, "loss": 0.9839, "step": 711 }, { "epoch": 0.98, "learning_rate": 1.572563470088768e-05, "loss": 0.9922, "step": 714 }, { "epoch": 0.98, "learning_rate": 1.56890955931764e-05, "loss": 0.9752, "step": 717 }, { "epoch": 0.99, "learning_rate": 1.565244383503407e-05, "loss": 0.9778, "step": 720 }, { "epoch": 0.99, "learning_rate": 1.5615680152206324e-05, "loss": 0.9795, "step": 723 }, { "epoch": 1.0, "learning_rate": 1.557880527265505e-05, "loss": 0.9774, "step": 726 } ], "logging_steps": 3, "max_steps": 2184, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500.0, "total_flos": 4.694048596218085e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }