{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.222658667991288, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003912727828512476, "grad_norm": 8.57561206817627, "learning_rate": 1.55e-06, "loss": 5.0338, "step": 32 }, { "epoch": 0.007825455657024952, "grad_norm": 5.36021089553833, "learning_rate": 3.1500000000000003e-06, "loss": 4.456, "step": 64 }, { "epoch": 0.011738183485537427, "grad_norm": 3.3196067810058594, "learning_rate": 4.75e-06, "loss": 3.9216, "step": 96 }, { "epoch": 0.015650911314049904, "grad_norm": 2.2839956283569336, "learning_rate": 6.35e-06, "loss": 3.6983, "step": 128 }, { "epoch": 0.01956363914256238, "grad_norm": 1.4226499795913696, "learning_rate": 7.95e-06, "loss": 3.5863, "step": 160 }, { "epoch": 0.023476366971074854, "grad_norm": 0.9770936369895935, "learning_rate": 9.55e-06, "loss": 3.5076, "step": 192 }, { "epoch": 0.02738909479958733, "grad_norm": 0.6855128407478333, "learning_rate": 1.115e-05, "loss": 3.4515, "step": 224 }, { "epoch": 0.03130182262809981, "grad_norm": 0.5743525624275208, "learning_rate": 1.2750000000000002e-05, "loss": 3.4145, "step": 256 }, { "epoch": 0.03521455045661228, "grad_norm": 0.4765739440917969, "learning_rate": 1.435e-05, "loss": 3.3892, "step": 288 }, { "epoch": 0.03912727828512476, "grad_norm": 0.40247443318367004, "learning_rate": 1.595e-05, "loss": 3.3664, "step": 320 }, { "epoch": 0.04304000611363723, "grad_norm": 0.3582874834537506, "learning_rate": 1.755e-05, "loss": 3.3487, "step": 352 }, { "epoch": 0.04695273394214971, "grad_norm": 0.31657862663269043, "learning_rate": 1.915e-05, "loss": 3.3349, "step": 384 }, { "epoch": 0.05086546177066218, "grad_norm": 0.28206518292427063, "learning_rate": 2.075e-05, "loss": 3.3197, "step": 416 }, { "epoch": 0.05477818959917466, "grad_norm": 0.2577824890613556, "learning_rate": 2.235e-05, "loss": 3.3058, "step": 448 }, { "epoch": 0.05869091742768714, "grad_norm": 0.23786848783493042, "learning_rate": 2.395e-05, "loss": 3.2955, "step": 480 }, { "epoch": 0.06260364525619962, "grad_norm": 0.2239329218864441, "learning_rate": 2.555e-05, "loss": 3.2846, "step": 512 }, { "epoch": 0.06651637308471209, "grad_norm": 0.22519271075725555, "learning_rate": 2.7150000000000003e-05, "loss": 3.2731, "step": 544 }, { "epoch": 0.07042910091322456, "grad_norm": 0.2189016044139862, "learning_rate": 2.8749999999999997e-05, "loss": 3.2663, "step": 576 }, { "epoch": 0.07434182874173703, "grad_norm": 0.20760661363601685, "learning_rate": 3.035e-05, "loss": 3.2581, "step": 608 }, { "epoch": 0.07825455657024952, "grad_norm": 0.205606147646904, "learning_rate": 3.1950000000000004e-05, "loss": 3.2451, "step": 640 }, { "epoch": 0.08216728439876199, "grad_norm": 0.22558899223804474, "learning_rate": 3.355e-05, "loss": 3.2412, "step": 672 }, { "epoch": 0.08608001222727446, "grad_norm": 0.22584667801856995, "learning_rate": 3.515e-05, "loss": 3.2358, "step": 704 }, { "epoch": 0.08999274005578695, "grad_norm": 0.22091105580329895, "learning_rate": 3.675e-05, "loss": 3.2302, "step": 736 }, { "epoch": 0.09390546788429942, "grad_norm": 0.22428959608078003, "learning_rate": 3.8350000000000004e-05, "loss": 3.2228, "step": 768 }, { "epoch": 0.09781819571281189, "grad_norm": 0.22730223834514618, "learning_rate": 3.995e-05, "loss": 3.2207, "step": 800 }, { "epoch": 0.10173092354132436, "grad_norm": 0.28039082884788513, "learning_rate": 4.155e-05, "loss": 3.2171, "step": 832 }, { "epoch": 0.10564365136983685, "grad_norm": 0.32776346802711487, "learning_rate": 4.315e-05, "loss": 3.2104, "step": 864 }, { "epoch": 0.10955637919834932, "grad_norm": 0.2800813615322113, "learning_rate": 4.4750000000000004e-05, "loss": 3.2053, "step": 896 }, { "epoch": 0.11346910702686179, "grad_norm": 0.24571874737739563, "learning_rate": 4.635e-05, "loss": 3.2046, "step": 928 }, { "epoch": 0.11738183485537428, "grad_norm": 0.5581298470497131, "learning_rate": 4.795e-05, "loss": 3.2004, "step": 960 }, { "epoch": 0.12129456268388675, "grad_norm": 0.47118815779685974, "learning_rate": 4.9550000000000005e-05, "loss": 3.1967, "step": 992 }, { "epoch": 0.12520729051239923, "grad_norm": 0.23707512021064758, "learning_rate": 4.9872222222222225e-05, "loss": 3.1945, "step": 1024 }, { "epoch": 0.1291200183409117, "grad_norm": 0.41069141030311584, "learning_rate": 4.969444444444445e-05, "loss": 3.1928, "step": 1056 }, { "epoch": 0.13303274616942418, "grad_norm": 0.376223623752594, "learning_rate": 4.9516666666666666e-05, "loss": 3.1871, "step": 1088 }, { "epoch": 0.13694547399793663, "grad_norm": 0.22380244731903076, "learning_rate": 4.933888888888889e-05, "loss": 3.1862, "step": 1120 }, { "epoch": 0.14085820182644912, "grad_norm": 0.2950900197029114, "learning_rate": 4.9161111111111115e-05, "loss": 3.1828, "step": 1152 }, { "epoch": 0.1447709296549616, "grad_norm": 0.25872257351875305, "learning_rate": 4.8983333333333336e-05, "loss": 3.1828, "step": 1184 }, { "epoch": 0.14868365748347406, "grad_norm": 0.3597142994403839, "learning_rate": 4.880555555555556e-05, "loss": 3.1845, "step": 1216 }, { "epoch": 0.15259638531198655, "grad_norm": 0.30377593636512756, "learning_rate": 4.862777777777778e-05, "loss": 3.1806, "step": 1248 }, { "epoch": 0.15650911314049903, "grad_norm": 0.3617115318775177, "learning_rate": 4.845e-05, "loss": 3.178, "step": 1280 }, { "epoch": 0.1604218409690115, "grad_norm": 0.31589606404304504, "learning_rate": 4.8272222222222226e-05, "loss": 3.1787, "step": 1312 }, { "epoch": 0.16433456879752398, "grad_norm": 0.30715763568878174, "learning_rate": 4.809444444444445e-05, "loss": 3.1754, "step": 1344 }, { "epoch": 0.16824729662603646, "grad_norm": 0.2574257254600525, "learning_rate": 4.791666666666667e-05, "loss": 3.1732, "step": 1376 }, { "epoch": 0.17216002445454892, "grad_norm": 0.3290633261203766, "learning_rate": 4.773888888888889e-05, "loss": 3.1723, "step": 1408 }, { "epoch": 0.1760727522830614, "grad_norm": 0.24164608120918274, "learning_rate": 4.756111111111111e-05, "loss": 3.1693, "step": 1440 }, { "epoch": 0.1799854801115739, "grad_norm": 0.30125918984413147, "learning_rate": 4.738333333333334e-05, "loss": 3.1685, "step": 1472 }, { "epoch": 0.18389820794008635, "grad_norm": 0.3488104045391083, "learning_rate": 4.720555555555556e-05, "loss": 3.1678, "step": 1504 }, { "epoch": 0.18781093576859884, "grad_norm": 0.2793637812137604, "learning_rate": 4.702777777777778e-05, "loss": 3.1668, "step": 1536 }, { "epoch": 0.1917236635971113, "grad_norm": 0.2682870030403137, "learning_rate": 4.685000000000001e-05, "loss": 3.1642, "step": 1568 }, { "epoch": 0.19563639142562378, "grad_norm": 0.36307454109191895, "learning_rate": 4.667222222222222e-05, "loss": 3.1654, "step": 1600 }, { "epoch": 0.19954911925413626, "grad_norm": 0.23930683732032776, "learning_rate": 4.649444444444445e-05, "loss": 3.1641, "step": 1632 }, { "epoch": 0.20346184708264872, "grad_norm": 0.3049800992012024, "learning_rate": 4.631666666666667e-05, "loss": 3.1654, "step": 1664 }, { "epoch": 0.2073745749111612, "grad_norm": 0.27725374698638916, "learning_rate": 4.613888888888889e-05, "loss": 3.1642, "step": 1696 }, { "epoch": 0.2112873027396737, "grad_norm": 0.2733665108680725, "learning_rate": 4.596111111111112e-05, "loss": 3.1584, "step": 1728 }, { "epoch": 0.21520003056818615, "grad_norm": 0.34570956230163574, "learning_rate": 4.578333333333333e-05, "loss": 3.162, "step": 1760 }, { "epoch": 0.21911275839669864, "grad_norm": 0.2521582543849945, "learning_rate": 4.560555555555556e-05, "loss": 3.1603, "step": 1792 }, { "epoch": 0.22302548622521112, "grad_norm": 0.29344356060028076, "learning_rate": 4.542777777777778e-05, "loss": 3.1587, "step": 1824 }, { "epoch": 0.22693821405372358, "grad_norm": 0.426881343126297, "learning_rate": 4.525e-05, "loss": 3.1561, "step": 1856 }, { "epoch": 0.23085094188223607, "grad_norm": 0.27699196338653564, "learning_rate": 4.507222222222223e-05, "loss": 3.1581, "step": 1888 }, { "epoch": 0.23476366971074855, "grad_norm": 0.32313504815101624, "learning_rate": 4.4894444444444444e-05, "loss": 3.1578, "step": 1920 }, { "epoch": 0.238676397539261, "grad_norm": 0.26697778701782227, "learning_rate": 4.4716666666666665e-05, "loss": 3.157, "step": 1952 }, { "epoch": 0.2425891253677735, "grad_norm": 0.2206508368253708, "learning_rate": 4.453888888888889e-05, "loss": 3.1551, "step": 1984 }, { "epoch": 0.24650185319628595, "grad_norm": 0.252888947725296, "learning_rate": 4.4361111111111113e-05, "loss": 3.1563, "step": 2016 }, { "epoch": 0.25041458102479847, "grad_norm": 0.28254494071006775, "learning_rate": 4.4183333333333334e-05, "loss": 3.156, "step": 2048 }, { "epoch": 0.2543273088533109, "grad_norm": 0.28460440039634705, "learning_rate": 4.4005555555555555e-05, "loss": 3.156, "step": 2080 }, { "epoch": 0.2582400366818234, "grad_norm": 0.290326863527298, "learning_rate": 4.3827777777777776e-05, "loss": 3.1518, "step": 2112 }, { "epoch": 0.26215276451033587, "grad_norm": 0.2769670784473419, "learning_rate": 4.3650000000000004e-05, "loss": 3.1515, "step": 2144 }, { "epoch": 0.26606549233884835, "grad_norm": 0.21678052842617035, "learning_rate": 4.3472222222222225e-05, "loss": 3.1518, "step": 2176 }, { "epoch": 0.26997822016736084, "grad_norm": 0.3134085536003113, "learning_rate": 4.3294444444444446e-05, "loss": 3.1501, "step": 2208 }, { "epoch": 0.27389094799587327, "grad_norm": 0.35099807381629944, "learning_rate": 4.311666666666667e-05, "loss": 3.1523, "step": 2240 }, { "epoch": 0.27780367582438575, "grad_norm": 0.27320197224617004, "learning_rate": 4.293888888888889e-05, "loss": 3.1507, "step": 2272 }, { "epoch": 0.28171640365289824, "grad_norm": 0.28096139430999756, "learning_rate": 4.2761111111111115e-05, "loss": 3.1474, "step": 2304 }, { "epoch": 0.2856291314814107, "grad_norm": 0.30300965905189514, "learning_rate": 4.2583333333333336e-05, "loss": 3.15, "step": 2336 }, { "epoch": 0.2895418593099232, "grad_norm": 0.2996535003185272, "learning_rate": 4.240555555555556e-05, "loss": 3.1528, "step": 2368 }, { "epoch": 0.2934545871384357, "grad_norm": 0.2503749132156372, "learning_rate": 4.222777777777778e-05, "loss": 3.1522, "step": 2400 }, { "epoch": 0.2973673149669481, "grad_norm": 0.2272900640964508, "learning_rate": 4.205e-05, "loss": 3.1472, "step": 2432 }, { "epoch": 0.3012800427954606, "grad_norm": 0.2367839366197586, "learning_rate": 4.1872222222222227e-05, "loss": 3.1479, "step": 2464 }, { "epoch": 0.3051927706239731, "grad_norm": 0.3656509220600128, "learning_rate": 4.169444444444445e-05, "loss": 3.1506, "step": 2496 }, { "epoch": 0.3091054984524856, "grad_norm": 0.25474536418914795, "learning_rate": 4.151666666666667e-05, "loss": 3.1506, "step": 2528 }, { "epoch": 0.31301822628099807, "grad_norm": 0.21729741990566254, "learning_rate": 4.133888888888889e-05, "loss": 3.1466, "step": 2560 }, { "epoch": 0.31693095410951055, "grad_norm": 0.26999133825302124, "learning_rate": 4.116111111111111e-05, "loss": 3.1468, "step": 2592 }, { "epoch": 0.320843681938023, "grad_norm": 0.2668827176094055, "learning_rate": 4.098333333333334e-05, "loss": 3.144, "step": 2624 }, { "epoch": 0.32475640976653547, "grad_norm": 0.24051733314990997, "learning_rate": 4.080555555555556e-05, "loss": 3.1465, "step": 2656 }, { "epoch": 0.32866913759504796, "grad_norm": 0.24717700481414795, "learning_rate": 4.062777777777778e-05, "loss": 3.1465, "step": 2688 }, { "epoch": 0.33258186542356044, "grad_norm": 0.23907746374607086, "learning_rate": 4.045000000000001e-05, "loss": 3.1453, "step": 2720 }, { "epoch": 0.3364945932520729, "grad_norm": 0.24447326362133026, "learning_rate": 4.027222222222222e-05, "loss": 3.1406, "step": 2752 }, { "epoch": 0.34040732108058536, "grad_norm": 0.25871723890304565, "learning_rate": 4.009444444444444e-05, "loss": 3.1435, "step": 2784 }, { "epoch": 0.34432004890909784, "grad_norm": 0.3173305094242096, "learning_rate": 3.991666666666667e-05, "loss": 3.1439, "step": 2816 }, { "epoch": 0.34823277673761033, "grad_norm": 0.2715188264846802, "learning_rate": 3.973888888888889e-05, "loss": 3.1433, "step": 2848 }, { "epoch": 0.3521455045661228, "grad_norm": 0.2764374315738678, "learning_rate": 3.956111111111112e-05, "loss": 3.1455, "step": 2880 }, { "epoch": 0.3560582323946353, "grad_norm": 0.3014623522758484, "learning_rate": 3.938333333333333e-05, "loss": 3.1399, "step": 2912 }, { "epoch": 0.3599709602231478, "grad_norm": 0.22385312616825104, "learning_rate": 3.9205555555555554e-05, "loss": 3.1426, "step": 2944 }, { "epoch": 0.3638836880516602, "grad_norm": 0.22400549054145813, "learning_rate": 3.902777777777778e-05, "loss": 3.1393, "step": 2976 }, { "epoch": 0.3677964158801727, "grad_norm": 0.266812801361084, "learning_rate": 3.885e-05, "loss": 3.1426, "step": 3008 }, { "epoch": 0.3717091437086852, "grad_norm": 0.2830856442451477, "learning_rate": 3.867222222222222e-05, "loss": 3.14, "step": 3040 }, { "epoch": 0.37562187153719767, "grad_norm": 0.2724515199661255, "learning_rate": 3.8494444444444444e-05, "loss": 3.1419, "step": 3072 }, { "epoch": 0.37953459936571016, "grad_norm": 0.22998973727226257, "learning_rate": 3.8316666666666665e-05, "loss": 3.139, "step": 3104 }, { "epoch": 0.3834473271942226, "grad_norm": 0.23931734263896942, "learning_rate": 3.813888888888889e-05, "loss": 3.1408, "step": 3136 }, { "epoch": 0.3873600550227351, "grad_norm": 0.26907482743263245, "learning_rate": 3.7961111111111114e-05, "loss": 3.1374, "step": 3168 }, { "epoch": 0.39127278285124756, "grad_norm": 0.24700401723384857, "learning_rate": 3.7783333333333335e-05, "loss": 3.137, "step": 3200 }, { "epoch": 0.39518551067976004, "grad_norm": 0.2963546812534332, "learning_rate": 3.7605555555555556e-05, "loss": 3.1401, "step": 3232 }, { "epoch": 0.39909823850827253, "grad_norm": 0.2659439444541931, "learning_rate": 3.7427777777777777e-05, "loss": 3.1387, "step": 3264 }, { "epoch": 0.403010966336785, "grad_norm": 0.26796412467956543, "learning_rate": 3.7250000000000004e-05, "loss": 3.1403, "step": 3296 }, { "epoch": 0.40692369416529744, "grad_norm": 0.29361388087272644, "learning_rate": 3.7072222222222225e-05, "loss": 3.1389, "step": 3328 }, { "epoch": 0.41083642199380993, "grad_norm": 0.24953944981098175, "learning_rate": 3.6894444444444446e-05, "loss": 3.1402, "step": 3360 }, { "epoch": 0.4147491498223224, "grad_norm": 0.23955155909061432, "learning_rate": 3.671666666666667e-05, "loss": 3.1377, "step": 3392 }, { "epoch": 0.4186618776508349, "grad_norm": 0.22984126210212708, "learning_rate": 3.653888888888889e-05, "loss": 3.1375, "step": 3424 }, { "epoch": 0.4225746054793474, "grad_norm": 0.2523467540740967, "learning_rate": 3.6361111111111116e-05, "loss": 3.1364, "step": 3456 }, { "epoch": 0.4264873333078598, "grad_norm": 0.23315957188606262, "learning_rate": 3.6183333333333336e-05, "loss": 3.1389, "step": 3488 }, { "epoch": 0.4304000611363723, "grad_norm": 0.22483432292938232, "learning_rate": 3.600555555555556e-05, "loss": 3.1357, "step": 3520 }, { "epoch": 0.4343127889648848, "grad_norm": 0.23685774207115173, "learning_rate": 3.582777777777778e-05, "loss": 3.136, "step": 3552 }, { "epoch": 0.4382255167933973, "grad_norm": 0.24475786089897156, "learning_rate": 3.565e-05, "loss": 3.1364, "step": 3584 }, { "epoch": 0.44213824462190976, "grad_norm": 0.21655669808387756, "learning_rate": 3.547222222222222e-05, "loss": 3.1363, "step": 3616 }, { "epoch": 0.44605097245042225, "grad_norm": 0.24810287356376648, "learning_rate": 3.529444444444445e-05, "loss": 3.1364, "step": 3648 }, { "epoch": 0.4499637002789347, "grad_norm": 0.23016402125358582, "learning_rate": 3.511666666666667e-05, "loss": 3.1345, "step": 3680 }, { "epoch": 0.45387642810744716, "grad_norm": 0.24041368067264557, "learning_rate": 3.4938888888888896e-05, "loss": 3.1389, "step": 3712 }, { "epoch": 0.45778915593595965, "grad_norm": 0.237365260720253, "learning_rate": 3.476111111111111e-05, "loss": 3.1335, "step": 3744 }, { "epoch": 0.46170188376447213, "grad_norm": 0.21840572357177734, "learning_rate": 3.458333333333333e-05, "loss": 3.1365, "step": 3776 }, { "epoch": 0.4656146115929846, "grad_norm": 0.22491848468780518, "learning_rate": 3.440555555555556e-05, "loss": 3.1365, "step": 3808 }, { "epoch": 0.4695273394214971, "grad_norm": 0.2349662482738495, "learning_rate": 3.422777777777778e-05, "loss": 3.1364, "step": 3840 }, { "epoch": 0.47344006725000953, "grad_norm": 0.3244574964046478, "learning_rate": 3.405e-05, "loss": 3.1333, "step": 3872 }, { "epoch": 0.477352795078522, "grad_norm": 0.20271480083465576, "learning_rate": 3.387222222222222e-05, "loss": 3.1337, "step": 3904 }, { "epoch": 0.4812655229070345, "grad_norm": 0.22787164151668549, "learning_rate": 3.369444444444444e-05, "loss": 3.1359, "step": 3936 }, { "epoch": 0.485178250735547, "grad_norm": 0.2814686894416809, "learning_rate": 3.351666666666667e-05, "loss": 3.1344, "step": 3968 }, { "epoch": 0.4890909785640595, "grad_norm": 0.20366469025611877, "learning_rate": 3.333888888888889e-05, "loss": 3.1342, "step": 4000 }, { "epoch": 0.4930037063925719, "grad_norm": 0.2670027017593384, "learning_rate": 3.316111111111111e-05, "loss": 3.1319, "step": 4032 }, { "epoch": 0.4969164342210844, "grad_norm": 0.2204466164112091, "learning_rate": 3.298333333333333e-05, "loss": 3.1328, "step": 4064 }, { "epoch": 0.5008291620495969, "grad_norm": 0.2765197157859802, "learning_rate": 3.2805555555555554e-05, "loss": 3.132, "step": 4096 }, { "epoch": 0.5047418898781093, "grad_norm": 0.2624960243701935, "learning_rate": 3.262777777777778e-05, "loss": 3.1348, "step": 4128 }, { "epoch": 0.5086546177066218, "grad_norm": 0.2254333347082138, "learning_rate": 3.245e-05, "loss": 3.1327, "step": 4160 }, { "epoch": 0.5125673455351343, "grad_norm": 0.25047773122787476, "learning_rate": 3.2272222222222224e-05, "loss": 3.1318, "step": 4192 }, { "epoch": 0.5164800733636468, "grad_norm": 0.23816271126270294, "learning_rate": 3.2094444444444445e-05, "loss": 3.1331, "step": 4224 }, { "epoch": 0.5203928011921592, "grad_norm": 0.22233732044696808, "learning_rate": 3.1916666666666665e-05, "loss": 3.1315, "step": 4256 }, { "epoch": 0.5243055290206717, "grad_norm": 0.25133851170539856, "learning_rate": 3.173888888888889e-05, "loss": 3.1333, "step": 4288 }, { "epoch": 0.5282182568491842, "grad_norm": 0.21504898369312286, "learning_rate": 3.1561111111111114e-05, "loss": 3.1332, "step": 4320 }, { "epoch": 0.5321309846776967, "grad_norm": 0.2872157394886017, "learning_rate": 3.1383333333333335e-05, "loss": 3.1303, "step": 4352 }, { "epoch": 0.5360437125062092, "grad_norm": 0.244154691696167, "learning_rate": 3.1205555555555556e-05, "loss": 3.1323, "step": 4384 }, { "epoch": 0.5399564403347217, "grad_norm": 0.24791453778743744, "learning_rate": 3.102777777777778e-05, "loss": 3.1312, "step": 4416 }, { "epoch": 0.5438691681632342, "grad_norm": 0.2378605306148529, "learning_rate": 3.0850000000000004e-05, "loss": 3.1309, "step": 4448 }, { "epoch": 0.5477818959917465, "grad_norm": 0.21514585614204407, "learning_rate": 3.0672222222222225e-05, "loss": 3.1244, "step": 4480 }, { "epoch": 0.551694623820259, "grad_norm": 0.22684329748153687, "learning_rate": 3.0494444444444446e-05, "loss": 3.1297, "step": 4512 }, { "epoch": 0.5556073516487715, "grad_norm": 0.21271203458309174, "learning_rate": 3.0316666666666664e-05, "loss": 3.1286, "step": 4544 }, { "epoch": 0.559520079477284, "grad_norm": 0.22873900830745697, "learning_rate": 3.0138888888888888e-05, "loss": 3.1262, "step": 4576 }, { "epoch": 0.5634328073057965, "grad_norm": 0.24229228496551514, "learning_rate": 2.9961111111111112e-05, "loss": 3.1312, "step": 4608 }, { "epoch": 0.567345535134309, "grad_norm": 0.2754037380218506, "learning_rate": 2.9783333333333337e-05, "loss": 3.1296, "step": 4640 }, { "epoch": 0.5712582629628215, "grad_norm": 0.20053815841674805, "learning_rate": 2.9605555555555558e-05, "loss": 3.128, "step": 4672 }, { "epoch": 0.5751709907913339, "grad_norm": 0.24577876925468445, "learning_rate": 2.9427777777777782e-05, "loss": 3.1302, "step": 4704 }, { "epoch": 0.5790837186198464, "grad_norm": 0.2547786235809326, "learning_rate": 2.925e-05, "loss": 3.1263, "step": 4736 }, { "epoch": 0.5829964464483589, "grad_norm": 0.18451441824436188, "learning_rate": 2.9072222222222224e-05, "loss": 3.1282, "step": 4768 }, { "epoch": 0.5869091742768714, "grad_norm": 0.21002881228923798, "learning_rate": 2.8894444444444445e-05, "loss": 3.1271, "step": 4800 }, { "epoch": 0.5908219021053838, "grad_norm": 0.21180187165737152, "learning_rate": 2.871666666666667e-05, "loss": 3.1272, "step": 4832 }, { "epoch": 0.5947346299338963, "grad_norm": 0.2123003453016281, "learning_rate": 2.8538888888888893e-05, "loss": 3.1285, "step": 4864 }, { "epoch": 0.5986473577624087, "grad_norm": 0.20064932107925415, "learning_rate": 2.836111111111111e-05, "loss": 3.1289, "step": 4896 }, { "epoch": 0.6025600855909212, "grad_norm": 0.19583889842033386, "learning_rate": 2.8183333333333335e-05, "loss": 3.128, "step": 4928 }, { "epoch": 0.6064728134194337, "grad_norm": 0.1817025989294052, "learning_rate": 2.8005555555555556e-05, "loss": 3.1263, "step": 4960 }, { "epoch": 0.6103855412479462, "grad_norm": 0.18323124945163727, "learning_rate": 2.782777777777778e-05, "loss": 3.1276, "step": 4992 }, { "epoch": 0.6142982690764587, "grad_norm": 0.21348968148231506, "learning_rate": 2.7650000000000005e-05, "loss": 3.1262, "step": 5024 }, { "epoch": 0.6182109969049712, "grad_norm": 0.24803143739700317, "learning_rate": 2.7472222222222222e-05, "loss": 3.1278, "step": 5056 }, { "epoch": 0.6221237247334837, "grad_norm": 0.27887552976608276, "learning_rate": 2.7294444444444443e-05, "loss": 3.1261, "step": 5088 }, { "epoch": 0.6260364525619961, "grad_norm": 0.20992670953273773, "learning_rate": 2.7116666666666667e-05, "loss": 3.1248, "step": 5120 }, { "epoch": 0.6299491803905086, "grad_norm": 0.20632390677928925, "learning_rate": 2.693888888888889e-05, "loss": 3.1295, "step": 5152 }, { "epoch": 0.6338619082190211, "grad_norm": 0.22720162570476532, "learning_rate": 2.6761111111111116e-05, "loss": 3.124, "step": 5184 }, { "epoch": 0.6377746360475335, "grad_norm": 0.20604351162910461, "learning_rate": 2.6583333333333333e-05, "loss": 3.1246, "step": 5216 }, { "epoch": 0.641687363876046, "grad_norm": 0.21567173302173615, "learning_rate": 2.6405555555555554e-05, "loss": 3.1266, "step": 5248 }, { "epoch": 0.6456000917045585, "grad_norm": 0.22443106770515442, "learning_rate": 2.622777777777778e-05, "loss": 3.1265, "step": 5280 }, { "epoch": 0.6495128195330709, "grad_norm": 0.2323237955570221, "learning_rate": 2.6050000000000003e-05, "loss": 3.1214, "step": 5312 }, { "epoch": 0.6534255473615834, "grad_norm": 0.21166770160198212, "learning_rate": 2.5872222222222224e-05, "loss": 3.125, "step": 5344 }, { "epoch": 0.6573382751900959, "grad_norm": 0.21922937035560608, "learning_rate": 2.5694444444444445e-05, "loss": 3.1236, "step": 5376 }, { "epoch": 0.6612510030186084, "grad_norm": 0.19853883981704712, "learning_rate": 2.5516666666666666e-05, "loss": 3.1256, "step": 5408 }, { "epoch": 0.6651637308471209, "grad_norm": 0.22357633709907532, "learning_rate": 2.533888888888889e-05, "loss": 3.1257, "step": 5440 }, { "epoch": 0.6690764586756334, "grad_norm": 0.22123898565769196, "learning_rate": 2.5161111111111114e-05, "loss": 3.1265, "step": 5472 }, { "epoch": 0.6729891865041459, "grad_norm": 0.20758691430091858, "learning_rate": 2.4983333333333335e-05, "loss": 3.1244, "step": 5504 }, { "epoch": 0.6769019143326583, "grad_norm": 0.19084863364696503, "learning_rate": 2.4805555555555556e-05, "loss": 3.124, "step": 5536 }, { "epoch": 0.6808146421611707, "grad_norm": 0.21082304418087006, "learning_rate": 2.462777777777778e-05, "loss": 3.1247, "step": 5568 }, { "epoch": 0.6847273699896832, "grad_norm": 0.19547946751117706, "learning_rate": 2.445e-05, "loss": 3.1254, "step": 5600 }, { "epoch": 0.6886400978181957, "grad_norm": 0.20289190113544464, "learning_rate": 2.4272222222222222e-05, "loss": 3.1274, "step": 5632 }, { "epoch": 0.6925528256467082, "grad_norm": 0.21069744229316711, "learning_rate": 2.4094444444444443e-05, "loss": 3.1235, "step": 5664 }, { "epoch": 0.6964655534752207, "grad_norm": 0.20337700843811035, "learning_rate": 2.3916666666666668e-05, "loss": 3.1253, "step": 5696 }, { "epoch": 0.7003782813037331, "grad_norm": 0.2150067836046219, "learning_rate": 2.3738888888888892e-05, "loss": 3.1255, "step": 5728 }, { "epoch": 0.7042910091322456, "grad_norm": 0.1990475058555603, "learning_rate": 2.3561111111111113e-05, "loss": 3.1247, "step": 5760 }, { "epoch": 0.7082037369607581, "grad_norm": 0.20272456109523773, "learning_rate": 2.3383333333333334e-05, "loss": 3.1235, "step": 5792 }, { "epoch": 0.7121164647892706, "grad_norm": 0.21050025522708893, "learning_rate": 2.3205555555555555e-05, "loss": 3.1226, "step": 5824 }, { "epoch": 0.7160291926177831, "grad_norm": 0.2530113160610199, "learning_rate": 2.302777777777778e-05, "loss": 3.1242, "step": 5856 }, { "epoch": 0.7199419204462956, "grad_norm": 0.2530890703201294, "learning_rate": 2.2850000000000003e-05, "loss": 3.1215, "step": 5888 }, { "epoch": 0.7238546482748079, "grad_norm": 0.19028717279434204, "learning_rate": 2.2672222222222224e-05, "loss": 3.1236, "step": 5920 }, { "epoch": 0.7277673761033204, "grad_norm": 0.20547839999198914, "learning_rate": 2.2494444444444445e-05, "loss": 3.1225, "step": 5952 }, { "epoch": 0.7316801039318329, "grad_norm": 0.19479484856128693, "learning_rate": 2.231666666666667e-05, "loss": 3.1248, "step": 5984 }, { "epoch": 0.7355928317603454, "grad_norm": 0.2140408456325531, "learning_rate": 2.213888888888889e-05, "loss": 3.1237, "step": 6016 }, { "epoch": 0.7395055595888579, "grad_norm": 0.17809583246707916, "learning_rate": 2.1961111111111114e-05, "loss": 3.1243, "step": 6048 }, { "epoch": 0.7434182874173704, "grad_norm": 0.19468888640403748, "learning_rate": 2.1783333333333332e-05, "loss": 3.1246, "step": 6080 }, { "epoch": 0.7473310152458829, "grad_norm": 0.2106105089187622, "learning_rate": 2.1605555555555556e-05, "loss": 3.1224, "step": 6112 }, { "epoch": 0.7512437430743953, "grad_norm": 0.20489418506622314, "learning_rate": 2.142777777777778e-05, "loss": 3.1237, "step": 6144 }, { "epoch": 0.7551564709029078, "grad_norm": 0.2453160136938095, "learning_rate": 2.125e-05, "loss": 3.1212, "step": 6176 }, { "epoch": 0.7590691987314203, "grad_norm": 0.2121828943490982, "learning_rate": 2.1072222222222222e-05, "loss": 3.1192, "step": 6208 }, { "epoch": 0.7629819265599328, "grad_norm": 0.18198275566101074, "learning_rate": 2.0894444444444443e-05, "loss": 3.1213, "step": 6240 }, { "epoch": 0.7668946543884452, "grad_norm": 0.1795693039894104, "learning_rate": 2.0716666666666668e-05, "loss": 3.1201, "step": 6272 }, { "epoch": 0.7708073822169577, "grad_norm": 0.24014544486999512, "learning_rate": 2.0538888888888892e-05, "loss": 3.122, "step": 6304 }, { "epoch": 0.7747201100454701, "grad_norm": 0.20040743052959442, "learning_rate": 2.0361111111111113e-05, "loss": 3.1207, "step": 6336 }, { "epoch": 0.7786328378739826, "grad_norm": 0.2076857089996338, "learning_rate": 2.0183333333333334e-05, "loss": 3.1245, "step": 6368 }, { "epoch": 0.7825455657024951, "grad_norm": 0.19411978125572205, "learning_rate": 2.0005555555555555e-05, "loss": 3.1216, "step": 6400 }, { "epoch": 0.7864582935310076, "grad_norm": 0.17701873183250427, "learning_rate": 1.982777777777778e-05, "loss": 3.1228, "step": 6432 }, { "epoch": 0.7903710213595201, "grad_norm": 0.19787663221359253, "learning_rate": 1.9650000000000003e-05, "loss": 3.122, "step": 6464 }, { "epoch": 0.7942837491880326, "grad_norm": 0.18991973996162415, "learning_rate": 1.947222222222222e-05, "loss": 3.1211, "step": 6496 }, { "epoch": 0.7981964770165451, "grad_norm": 0.18508349359035492, "learning_rate": 1.9294444444444445e-05, "loss": 3.1211, "step": 6528 }, { "epoch": 0.8021092048450575, "grad_norm": 0.17648939788341522, "learning_rate": 1.911666666666667e-05, "loss": 3.1237, "step": 6560 }, { "epoch": 0.80602193267357, "grad_norm": 0.20672652125358582, "learning_rate": 1.893888888888889e-05, "loss": 3.1213, "step": 6592 }, { "epoch": 0.8099346605020824, "grad_norm": 0.21490968763828278, "learning_rate": 1.876111111111111e-05, "loss": 3.1201, "step": 6624 }, { "epoch": 0.8138473883305949, "grad_norm": 0.20175087451934814, "learning_rate": 1.8583333333333332e-05, "loss": 3.1184, "step": 6656 }, { "epoch": 0.8177601161591074, "grad_norm": 0.17700786888599396, "learning_rate": 1.8405555555555556e-05, "loss": 3.1194, "step": 6688 }, { "epoch": 0.8216728439876199, "grad_norm": 0.19697381556034088, "learning_rate": 1.822777777777778e-05, "loss": 3.1208, "step": 6720 }, { "epoch": 0.8255855718161323, "grad_norm": 0.19516746699810028, "learning_rate": 1.805e-05, "loss": 3.122, "step": 6752 }, { "epoch": 0.8294982996446448, "grad_norm": 0.19233250617980957, "learning_rate": 1.7872222222222223e-05, "loss": 3.1237, "step": 6784 }, { "epoch": 0.8334110274731573, "grad_norm": 0.20740792155265808, "learning_rate": 1.7694444444444443e-05, "loss": 3.1227, "step": 6816 }, { "epoch": 0.8373237553016698, "grad_norm": 0.18789739906787872, "learning_rate": 1.7516666666666668e-05, "loss": 3.1198, "step": 6848 }, { "epoch": 0.8412364831301823, "grad_norm": 0.17981740832328796, "learning_rate": 1.7338888888888892e-05, "loss": 3.121, "step": 6880 }, { "epoch": 0.8451492109586948, "grad_norm": 0.2110264003276825, "learning_rate": 1.716111111111111e-05, "loss": 3.1186, "step": 6912 }, { "epoch": 0.8490619387872073, "grad_norm": 0.19858282804489136, "learning_rate": 1.6983333333333334e-05, "loss": 3.1236, "step": 6944 }, { "epoch": 0.8529746666157196, "grad_norm": 0.17566311359405518, "learning_rate": 1.6805555555555558e-05, "loss": 3.1225, "step": 6976 }, { "epoch": 0.8568873944442321, "grad_norm": 0.19274671375751495, "learning_rate": 1.662777777777778e-05, "loss": 3.1197, "step": 7008 }, { "epoch": 0.8608001222727446, "grad_norm": 0.20043255388736725, "learning_rate": 1.645e-05, "loss": 3.1221, "step": 7040 }, { "epoch": 0.8647128501012571, "grad_norm": 0.17369119822978973, "learning_rate": 1.627222222222222e-05, "loss": 3.119, "step": 7072 }, { "epoch": 0.8686255779297696, "grad_norm": 0.18795572221279144, "learning_rate": 1.6094444444444445e-05, "loss": 3.116, "step": 7104 }, { "epoch": 0.8725383057582821, "grad_norm": 0.20084317028522491, "learning_rate": 1.591666666666667e-05, "loss": 3.1164, "step": 7136 }, { "epoch": 0.8764510335867945, "grad_norm": 0.1732749342918396, "learning_rate": 1.573888888888889e-05, "loss": 3.1184, "step": 7168 }, { "epoch": 0.880363761415307, "grad_norm": 0.18775592744350433, "learning_rate": 1.556111111111111e-05, "loss": 3.1186, "step": 7200 }, { "epoch": 0.8842764892438195, "grad_norm": 0.1810338944196701, "learning_rate": 1.5383333333333332e-05, "loss": 3.1211, "step": 7232 }, { "epoch": 0.888189217072332, "grad_norm": 0.17264607548713684, "learning_rate": 1.5205555555555557e-05, "loss": 3.115, "step": 7264 }, { "epoch": 0.8921019449008445, "grad_norm": 0.18331947922706604, "learning_rate": 1.502777777777778e-05, "loss": 3.1176, "step": 7296 }, { "epoch": 0.896014672729357, "grad_norm": 0.1883401870727539, "learning_rate": 1.485e-05, "loss": 3.1194, "step": 7328 }, { "epoch": 0.8999274005578694, "grad_norm": 0.17407892644405365, "learning_rate": 1.4672222222222223e-05, "loss": 3.1188, "step": 7360 }, { "epoch": 0.9038401283863818, "grad_norm": 0.1941099464893341, "learning_rate": 1.4494444444444444e-05, "loss": 3.1211, "step": 7392 }, { "epoch": 0.9077528562148943, "grad_norm": 0.17381389439105988, "learning_rate": 1.4316666666666668e-05, "loss": 3.1194, "step": 7424 }, { "epoch": 0.9116655840434068, "grad_norm": 0.18369047343730927, "learning_rate": 1.413888888888889e-05, "loss": 3.1165, "step": 7456 }, { "epoch": 0.9155783118719193, "grad_norm": 0.17392371594905853, "learning_rate": 1.3961111111111111e-05, "loss": 3.1165, "step": 7488 }, { "epoch": 0.9194910397004318, "grad_norm": 0.17337463796138763, "learning_rate": 1.3783333333333334e-05, "loss": 3.1192, "step": 7520 }, { "epoch": 0.9234037675289443, "grad_norm": 0.1813974380493164, "learning_rate": 1.3605555555555557e-05, "loss": 3.1158, "step": 7552 }, { "epoch": 0.9273164953574567, "grad_norm": 0.1770683377981186, "learning_rate": 1.3427777777777778e-05, "loss": 3.1173, "step": 7584 }, { "epoch": 0.9312292231859692, "grad_norm": 0.18390090763568878, "learning_rate": 1.3250000000000002e-05, "loss": 3.1211, "step": 7616 }, { "epoch": 0.9351419510144817, "grad_norm": 0.17356765270233154, "learning_rate": 1.3072222222222221e-05, "loss": 3.1187, "step": 7648 }, { "epoch": 0.9390546788429942, "grad_norm": 0.173334538936615, "learning_rate": 1.2894444444444445e-05, "loss": 3.1191, "step": 7680 }, { "epoch": 0.9429674066715066, "grad_norm": 0.18598856031894684, "learning_rate": 1.2716666666666668e-05, "loss": 3.1192, "step": 7712 }, { "epoch": 0.9468801345000191, "grad_norm": 0.1667858213186264, "learning_rate": 1.2538888888888889e-05, "loss": 3.1173, "step": 7744 }, { "epoch": 0.9507928623285316, "grad_norm": 0.17433424293994904, "learning_rate": 1.2361111111111112e-05, "loss": 3.1184, "step": 7776 }, { "epoch": 0.954705590157044, "grad_norm": 0.1921132653951645, "learning_rate": 1.2183333333333334e-05, "loss": 3.119, "step": 7808 }, { "epoch": 0.9586183179855565, "grad_norm": 0.16437648236751556, "learning_rate": 1.2005555555555557e-05, "loss": 3.1179, "step": 7840 }, { "epoch": 0.962531045814069, "grad_norm": 0.17323090136051178, "learning_rate": 1.1827777777777778e-05, "loss": 3.1192, "step": 7872 }, { "epoch": 0.9664437736425815, "grad_norm": 0.16646146774291992, "learning_rate": 1.1650000000000002e-05, "loss": 3.1176, "step": 7904 }, { "epoch": 0.970356501471094, "grad_norm": 0.18198241293430328, "learning_rate": 1.1472222222222223e-05, "loss": 3.1178, "step": 7936 }, { "epoch": 0.9742692292996065, "grad_norm": 0.17490531504154205, "learning_rate": 1.1294444444444445e-05, "loss": 3.1161, "step": 7968 }, { "epoch": 0.978181957128119, "grad_norm": 0.17505322396755219, "learning_rate": 1.1116666666666666e-05, "loss": 3.1213, "step": 8000 }, { "epoch": 0.9820946849566314, "grad_norm": 0.17005711793899536, "learning_rate": 1.0938888888888889e-05, "loss": 3.1187, "step": 8032 }, { "epoch": 0.9860074127851438, "grad_norm": 0.18125712871551514, "learning_rate": 1.0761111111111112e-05, "loss": 3.12, "step": 8064 }, { "epoch": 0.9899201406136563, "grad_norm": 0.17013822495937347, "learning_rate": 1.0583333333333334e-05, "loss": 3.1157, "step": 8096 }, { "epoch": 0.9938328684421688, "grad_norm": 0.1698048710823059, "learning_rate": 1.0405555555555555e-05, "loss": 3.1172, "step": 8128 }, { "epoch": 0.9977455962706813, "grad_norm": 0.17143802344799042, "learning_rate": 1.0227777777777778e-05, "loss": 3.1153, "step": 8160 }, { "epoch": 1.0015895456803332, "grad_norm": 0.1739780455827713, "learning_rate": 1.005e-05, "loss": 3.1163, "step": 8192 }, { "epoch": 1.0055022735088457, "grad_norm": 0.17907440662384033, "learning_rate": 9.872222222222223e-06, "loss": 3.1143, "step": 8224 }, { "epoch": 1.0094150013373582, "grad_norm": 0.17365169525146484, "learning_rate": 9.694444444444446e-06, "loss": 3.1157, "step": 8256 }, { "epoch": 1.0133277291658707, "grad_norm": 0.1645737588405609, "learning_rate": 9.516666666666666e-06, "loss": 3.1134, "step": 8288 }, { "epoch": 1.0172404569943831, "grad_norm": 0.15174245834350586, "learning_rate": 9.338888888888889e-06, "loss": 3.1142, "step": 8320 }, { "epoch": 1.0211531848228956, "grad_norm": 0.16984011232852936, "learning_rate": 9.161111111111112e-06, "loss": 3.1142, "step": 8352 }, { "epoch": 1.0250659126514081, "grad_norm": 0.1772463321685791, "learning_rate": 8.983333333333334e-06, "loss": 3.1178, "step": 8384 }, { "epoch": 1.0289786404799206, "grad_norm": 0.16304141283035278, "learning_rate": 8.805555555555555e-06, "loss": 3.113, "step": 8416 }, { "epoch": 1.032891368308433, "grad_norm": 0.15513816475868225, "learning_rate": 8.627777777777778e-06, "loss": 3.1145, "step": 8448 }, { "epoch": 1.0368040961369456, "grad_norm": 0.1862088292837143, "learning_rate": 8.45e-06, "loss": 3.1109, "step": 8480 }, { "epoch": 1.0407168239654578, "grad_norm": 0.17995817959308624, "learning_rate": 8.272222222222223e-06, "loss": 3.1128, "step": 8512 }, { "epoch": 1.0446295517939703, "grad_norm": 0.1758676916360855, "learning_rate": 8.094444444444444e-06, "loss": 3.1128, "step": 8544 }, { "epoch": 1.0485422796224828, "grad_norm": 0.16609688103199005, "learning_rate": 7.916666666666667e-06, "loss": 3.114, "step": 8576 }, { "epoch": 1.0524550074509953, "grad_norm": 0.15258896350860596, "learning_rate": 7.738888888888889e-06, "loss": 3.1171, "step": 8608 }, { "epoch": 1.0563677352795078, "grad_norm": 0.16240954399108887, "learning_rate": 7.561111111111112e-06, "loss": 3.113, "step": 8640 }, { "epoch": 1.0602804631080203, "grad_norm": 0.16423362493515015, "learning_rate": 7.3833333333333335e-06, "loss": 3.1154, "step": 8672 }, { "epoch": 1.0641931909365328, "grad_norm": 0.17032068967819214, "learning_rate": 7.205555555555555e-06, "loss": 3.1146, "step": 8704 }, { "epoch": 1.0681059187650452, "grad_norm": 0.1564359813928604, "learning_rate": 7.027777777777778e-06, "loss": 3.1162, "step": 8736 }, { "epoch": 1.0720186465935577, "grad_norm": 0.15838623046875, "learning_rate": 6.8500000000000005e-06, "loss": 3.113, "step": 8768 }, { "epoch": 1.0759313744220702, "grad_norm": 0.17325465381145477, "learning_rate": 6.672222222222223e-06, "loss": 3.1153, "step": 8800 }, { "epoch": 1.0798441022505827, "grad_norm": 0.16170760989189148, "learning_rate": 6.494444444444445e-06, "loss": 3.115, "step": 8832 }, { "epoch": 1.0837568300790952, "grad_norm": 0.15591956675052643, "learning_rate": 6.316666666666667e-06, "loss": 3.1088, "step": 8864 }, { "epoch": 1.0876695579076077, "grad_norm": 0.15115121006965637, "learning_rate": 6.138888888888889e-06, "loss": 3.1103, "step": 8896 }, { "epoch": 1.0915822857361202, "grad_norm": 0.1577509045600891, "learning_rate": 5.961111111111111e-06, "loss": 3.112, "step": 8928 }, { "epoch": 1.0954950135646326, "grad_norm": 0.1545899361371994, "learning_rate": 5.783333333333334e-06, "loss": 3.1108, "step": 8960 }, { "epoch": 1.0994077413931451, "grad_norm": 0.1597297489643097, "learning_rate": 5.605555555555555e-06, "loss": 3.1172, "step": 8992 }, { "epoch": 1.1033204692216576, "grad_norm": 0.16016387939453125, "learning_rate": 5.427777777777778e-06, "loss": 3.1156, "step": 9024 }, { "epoch": 1.10723319705017, "grad_norm": 0.15304987132549286, "learning_rate": 5.25e-06, "loss": 3.1126, "step": 9056 }, { "epoch": 1.1111459248786826, "grad_norm": 0.1560225784778595, "learning_rate": 5.072222222222222e-06, "loss": 3.1152, "step": 9088 }, { "epoch": 1.115058652707195, "grad_norm": 0.16613492369651794, "learning_rate": 4.894444444444445e-06, "loss": 3.1147, "step": 9120 }, { "epoch": 1.1189713805357075, "grad_norm": 0.15055406093597412, "learning_rate": 4.7166666666666675e-06, "loss": 3.1116, "step": 9152 }, { "epoch": 1.12288410836422, "grad_norm": 0.16280752420425415, "learning_rate": 4.538888888888889e-06, "loss": 3.1148, "step": 9184 }, { "epoch": 1.1267968361927325, "grad_norm": 0.1523207277059555, "learning_rate": 4.361111111111112e-06, "loss": 3.1133, "step": 9216 }, { "epoch": 1.1307095640212448, "grad_norm": 0.1500737965106964, "learning_rate": 4.183333333333334e-06, "loss": 3.1177, "step": 9248 }, { "epoch": 1.1346222918497573, "grad_norm": 0.16134943068027496, "learning_rate": 4.005555555555555e-06, "loss": 3.1143, "step": 9280 }, { "epoch": 1.1385350196782698, "grad_norm": 0.1499546766281128, "learning_rate": 3.827777777777778e-06, "loss": 3.1133, "step": 9312 }, { "epoch": 1.1424477475067822, "grad_norm": 0.15620845556259155, "learning_rate": 3.6499999999999998e-06, "loss": 3.1122, "step": 9344 }, { "epoch": 1.1463604753352947, "grad_norm": 0.15544985234737396, "learning_rate": 3.4722222222222224e-06, "loss": 3.1146, "step": 9376 }, { "epoch": 1.1502732031638072, "grad_norm": 0.15928788483142853, "learning_rate": 3.2944444444444446e-06, "loss": 3.1123, "step": 9408 }, { "epoch": 1.1541859309923197, "grad_norm": 0.14999979734420776, "learning_rate": 3.1166666666666668e-06, "loss": 3.1149, "step": 9440 }, { "epoch": 1.1580986588208322, "grad_norm": 0.15014442801475525, "learning_rate": 2.938888888888889e-06, "loss": 3.1113, "step": 9472 }, { "epoch": 1.1620113866493447, "grad_norm": 0.14749625325202942, "learning_rate": 2.761111111111111e-06, "loss": 3.113, "step": 9504 }, { "epoch": 1.1659241144778572, "grad_norm": 0.14931970834732056, "learning_rate": 2.5833333333333333e-06, "loss": 3.1144, "step": 9536 }, { "epoch": 1.1698368423063696, "grad_norm": 0.14572674036026, "learning_rate": 2.4055555555555555e-06, "loss": 3.1093, "step": 9568 }, { "epoch": 1.1737495701348821, "grad_norm": 0.15361888706684113, "learning_rate": 2.227777777777778e-06, "loss": 3.1138, "step": 9600 }, { "epoch": 1.1776622979633946, "grad_norm": 0.1433536857366562, "learning_rate": 2.0500000000000003e-06, "loss": 3.1123, "step": 9632 }, { "epoch": 1.181575025791907, "grad_norm": 0.14533208310604095, "learning_rate": 1.8722222222222225e-06, "loss": 3.1116, "step": 9664 }, { "epoch": 1.1854877536204196, "grad_norm": 0.14816279709339142, "learning_rate": 1.6944444444444446e-06, "loss": 3.1128, "step": 9696 }, { "epoch": 1.189400481448932, "grad_norm": 0.14798638224601746, "learning_rate": 1.5166666666666668e-06, "loss": 3.116, "step": 9728 }, { "epoch": 1.1933132092774446, "grad_norm": 0.1386597454547882, "learning_rate": 1.338888888888889e-06, "loss": 3.1145, "step": 9760 }, { "epoch": 1.197225937105957, "grad_norm": 0.14148685336112976, "learning_rate": 1.161111111111111e-06, "loss": 3.1115, "step": 9792 }, { "epoch": 1.2011386649344695, "grad_norm": 0.14324016869068146, "learning_rate": 9.833333333333334e-07, "loss": 3.1117, "step": 9824 }, { "epoch": 1.205051392762982, "grad_norm": 0.14499281346797943, "learning_rate": 8.055555555555556e-07, "loss": 3.1129, "step": 9856 }, { "epoch": 1.2089641205914945, "grad_norm": 0.1464635133743286, "learning_rate": 6.277777777777778e-07, "loss": 3.1169, "step": 9888 }, { "epoch": 1.2128768484200068, "grad_norm": 0.14767299592494965, "learning_rate": 4.5e-07, "loss": 3.1131, "step": 9920 }, { "epoch": 1.2167895762485195, "grad_norm": 0.14456725120544434, "learning_rate": 2.722222222222222e-07, "loss": 3.116, "step": 9952 }, { "epoch": 1.2207023040770317, "grad_norm": 0.1386868953704834, "learning_rate": 9.444444444444445e-08, "loss": 3.1151, "step": 9984 }, { "epoch": 1.222658667991288, "step": 10000, "total_flos": 8.246852548747592e+18, "train_loss": 1.5593041332244872, "train_runtime": 85792.9956, "train_samples_per_second": 238.714, "train_steps_per_second": 0.117 } ], "logging_steps": 32, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.246852548747592e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }