{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.990047413608987, "eval_steps": 1000, "global_step": 70000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004271496305155696, "grad_norm": 5.181503772735596, "learning_rate": 7.118451025056948e-07, "loss": 5.7231, "step": 100 }, { "epoch": 0.008542992610311393, "grad_norm": 4.810484886169434, "learning_rate": 1.4236902050113896e-06, "loss": 5.663, "step": 200 }, { "epoch": 0.012814488915467088, "grad_norm": 6.623730182647705, "learning_rate": 2.1355353075170844e-06, "loss": 5.8302, "step": 300 }, { "epoch": 0.017085985220622785, "grad_norm": 4.488267421722412, "learning_rate": 2.847380410022779e-06, "loss": 5.5719, "step": 400 }, { "epoch": 0.02135748152577848, "grad_norm": 5.217711448669434, "learning_rate": 3.559225512528474e-06, "loss": 5.572, "step": 500 }, { "epoch": 0.025628977830934176, "grad_norm": 6.749180316925049, "learning_rate": 4.271070615034169e-06, "loss": 5.6101, "step": 600 }, { "epoch": 0.029900474136089872, "grad_norm": 7.648950099945068, "learning_rate": 4.9829157175398636e-06, "loss": 5.6377, "step": 700 }, { "epoch": 0.03417197044124557, "grad_norm": 8.535877227783203, "learning_rate": 5.694760820045558e-06, "loss": 5.3197, "step": 800 }, { "epoch": 0.03844346674640126, "grad_norm": 10.138134002685547, "learning_rate": 6.406605922551254e-06, "loss": 5.2204, "step": 900 }, { "epoch": 0.04271496305155696, "grad_norm": 9.492013931274414, "learning_rate": 7.118451025056948e-06, "loss": 5.1271, "step": 1000 }, { "epoch": 0.04271496305155696, "eval_runtime": 404.87, "eval_samples_per_second": 115.647, "eval_steps_per_second": 14.456, "step": 1000 }, { "epoch": 0.046986459356712654, "grad_norm": 9.8007230758667, "learning_rate": 7.830296127562643e-06, "loss": 4.8906, "step": 1100 }, { "epoch": 0.05125795566186835, "grad_norm": 15.544700622558594, "learning_rate": 8.542141230068338e-06, "loss": 4.7123, "step": 1200 }, { "epoch": 0.05552945196702405, "grad_norm": 11.552872657775879, "learning_rate": 9.253986332574032e-06, "loss": 4.542, "step": 1300 }, { "epoch": 0.059800948272179744, "grad_norm": 8.390003204345703, "learning_rate": 9.965831435079727e-06, "loss": 4.1772, "step": 1400 }, { "epoch": 0.06407244457733544, "grad_norm": 6.352422714233398, "learning_rate": 1.0677676537585422e-05, "loss": 4.1877, "step": 1500 }, { "epoch": 0.06834394088249114, "grad_norm": 11.394415855407715, "learning_rate": 1.1389521640091117e-05, "loss": 4.1625, "step": 1600 }, { "epoch": 0.07261543718764683, "grad_norm": 7.095893859863281, "learning_rate": 1.2101366742596812e-05, "loss": 4.0512, "step": 1700 }, { "epoch": 0.07688693349280253, "grad_norm": 11.011418342590332, "learning_rate": 1.2813211845102508e-05, "loss": 4.0444, "step": 1800 }, { "epoch": 0.08115842979795823, "grad_norm": 7.294766902923584, "learning_rate": 1.35250569476082e-05, "loss": 4.1648, "step": 1900 }, { "epoch": 0.08542992610311392, "grad_norm": 9.359979629516602, "learning_rate": 1.4236902050113896e-05, "loss": 4.1958, "step": 2000 }, { "epoch": 0.08542992610311392, "eval_runtime": 404.9689, "eval_samples_per_second": 115.619, "eval_steps_per_second": 14.453, "step": 2000 }, { "epoch": 0.08970142240826962, "grad_norm": 9.699823379516602, "learning_rate": 1.494874715261959e-05, "loss": 4.0555, "step": 2100 }, { "epoch": 0.09397291871342531, "grad_norm": 9.576581954956055, "learning_rate": 1.5660592255125285e-05, "loss": 4.1073, "step": 2200 }, { "epoch": 0.09824441501858101, "grad_norm": 9.165473937988281, "learning_rate": 1.637243735763098e-05, "loss": 4.0373, "step": 2300 }, { "epoch": 0.1025159113237367, "grad_norm": 9.258238792419434, "learning_rate": 1.7084282460136675e-05, "loss": 3.8695, "step": 2400 }, { "epoch": 0.1067874076288924, "grad_norm": 10.60352897644043, "learning_rate": 1.779612756264237e-05, "loss": 3.8457, "step": 2500 }, { "epoch": 0.1110589039340481, "grad_norm": 8.64367961883545, "learning_rate": 1.8507972665148065e-05, "loss": 3.8887, "step": 2600 }, { "epoch": 0.1153304002392038, "grad_norm": 11.192293167114258, "learning_rate": 1.9219817767653758e-05, "loss": 4.0215, "step": 2700 }, { "epoch": 0.11960189654435949, "grad_norm": 11.153294563293457, "learning_rate": 1.9931662870159454e-05, "loss": 3.9655, "step": 2800 }, { "epoch": 0.12387339284951518, "grad_norm": 9.78614616394043, "learning_rate": 2.064350797266515e-05, "loss": 4.0018, "step": 2900 }, { "epoch": 0.12814488915467087, "grad_norm": 11.2694730758667, "learning_rate": 2.1355353075170844e-05, "loss": 3.9469, "step": 3000 }, { "epoch": 0.12814488915467087, "eval_runtime": 404.9852, "eval_samples_per_second": 115.614, "eval_steps_per_second": 14.452, "step": 3000 }, { "epoch": 0.1324163854598266, "grad_norm": 9.227404594421387, "learning_rate": 2.2067198177676537e-05, "loss": 3.9662, "step": 3100 }, { "epoch": 0.13668788176498228, "grad_norm": 13.020267486572266, "learning_rate": 2.2779043280182233e-05, "loss": 3.9011, "step": 3200 }, { "epoch": 0.14095937807013798, "grad_norm": 8.806400299072266, "learning_rate": 2.349088838268793e-05, "loss": 3.9507, "step": 3300 }, { "epoch": 0.14523087437529367, "grad_norm": 7.716139793395996, "learning_rate": 2.4202733485193623e-05, "loss": 3.9536, "step": 3400 }, { "epoch": 0.14950237068044936, "grad_norm": 12.768115997314453, "learning_rate": 2.4914578587699316e-05, "loss": 3.9268, "step": 3500 }, { "epoch": 0.15377386698560505, "grad_norm": 10.236725807189941, "learning_rate": 2.5626423690205016e-05, "loss": 3.8231, "step": 3600 }, { "epoch": 0.15804536329076074, "grad_norm": 13.891462326049805, "learning_rate": 2.633826879271071e-05, "loss": 3.8843, "step": 3700 }, { "epoch": 0.16231685959591646, "grad_norm": 12.619128227233887, "learning_rate": 2.70501138952164e-05, "loss": 3.8942, "step": 3800 }, { "epoch": 0.16658835590107215, "grad_norm": 8.733612060546875, "learning_rate": 2.77619589977221e-05, "loss": 3.7588, "step": 3900 }, { "epoch": 0.17085985220622785, "grad_norm": 10.724875450134277, "learning_rate": 2.8473804100227792e-05, "loss": 3.8525, "step": 4000 }, { "epoch": 0.17085985220622785, "eval_runtime": 404.7816, "eval_samples_per_second": 115.672, "eval_steps_per_second": 14.46, "step": 4000 }, { "epoch": 0.17513134851138354, "grad_norm": 9.703784942626953, "learning_rate": 2.9185649202733488e-05, "loss": 3.7397, "step": 4100 }, { "epoch": 0.17940284481653923, "grad_norm": 10.537924766540527, "learning_rate": 2.989749430523918e-05, "loss": 3.8457, "step": 4200 }, { "epoch": 0.18367434112169492, "grad_norm": 13.7029447555542, "learning_rate": 3.0609339407744874e-05, "loss": 3.8889, "step": 4300 }, { "epoch": 0.18794583742685061, "grad_norm": 11.692300796508789, "learning_rate": 3.132118451025057e-05, "loss": 3.9057, "step": 4400 }, { "epoch": 0.19221733373200633, "grad_norm": 11.873428344726562, "learning_rate": 3.203302961275627e-05, "loss": 3.8959, "step": 4500 }, { "epoch": 0.19648883003716203, "grad_norm": 10.291272163391113, "learning_rate": 3.274487471526196e-05, "loss": 3.8212, "step": 4600 }, { "epoch": 0.20076032634231772, "grad_norm": 10.874945640563965, "learning_rate": 3.3456719817767654e-05, "loss": 3.8884, "step": 4700 }, { "epoch": 0.2050318226474734, "grad_norm": 15.713820457458496, "learning_rate": 3.416856492027335e-05, "loss": 3.9066, "step": 4800 }, { "epoch": 0.2093033189526291, "grad_norm": 11.526785850524902, "learning_rate": 3.488041002277905e-05, "loss": 3.7686, "step": 4900 }, { "epoch": 0.2135748152577848, "grad_norm": 10.42326545715332, "learning_rate": 3.559225512528474e-05, "loss": 3.784, "step": 5000 }, { "epoch": 0.2135748152577848, "eval_runtime": 404.8034, "eval_samples_per_second": 115.666, "eval_steps_per_second": 14.459, "step": 5000 }, { "epoch": 0.2178463115629405, "grad_norm": 14.717082023620605, "learning_rate": 3.630410022779043e-05, "loss": 3.8462, "step": 5100 }, { "epoch": 0.2221178078680962, "grad_norm": 11.965718269348145, "learning_rate": 3.701594533029613e-05, "loss": 3.9435, "step": 5200 }, { "epoch": 0.2263893041732519, "grad_norm": 10.752185821533203, "learning_rate": 3.7727790432801826e-05, "loss": 3.8663, "step": 5300 }, { "epoch": 0.2306608004784076, "grad_norm": 12.059910774230957, "learning_rate": 3.8439635535307516e-05, "loss": 3.7925, "step": 5400 }, { "epoch": 0.23493229678356328, "grad_norm": 9.081160545349121, "learning_rate": 3.915148063781321e-05, "loss": 3.8639, "step": 5500 }, { "epoch": 0.23920379308871897, "grad_norm": 10.45064926147461, "learning_rate": 3.986332574031891e-05, "loss": 3.8497, "step": 5600 }, { "epoch": 0.24347528939387467, "grad_norm": 15.188603401184082, "learning_rate": 4.0575170842824605e-05, "loss": 3.7753, "step": 5700 }, { "epoch": 0.24774678569903036, "grad_norm": 9.032523155212402, "learning_rate": 4.12870159453303e-05, "loss": 3.8309, "step": 5800 }, { "epoch": 0.2520182820041861, "grad_norm": 9.886519432067871, "learning_rate": 4.199886104783599e-05, "loss": 3.8508, "step": 5900 }, { "epoch": 0.25628977830934174, "grad_norm": 11.432881355285645, "learning_rate": 4.271070615034169e-05, "loss": 3.8327, "step": 6000 }, { "epoch": 0.25628977830934174, "eval_runtime": 404.7732, "eval_samples_per_second": 115.675, "eval_steps_per_second": 14.46, "step": 6000 }, { "epoch": 0.26056127461449746, "grad_norm": 10.134676933288574, "learning_rate": 4.3422551252847384e-05, "loss": 3.8419, "step": 6100 }, { "epoch": 0.2648327709196532, "grad_norm": 12.583077430725098, "learning_rate": 4.4134396355353074e-05, "loss": 3.884, "step": 6200 }, { "epoch": 0.26910426722480885, "grad_norm": 9.845976829528809, "learning_rate": 4.484624145785877e-05, "loss": 3.7787, "step": 6300 }, { "epoch": 0.27337576352996457, "grad_norm": 21.58133888244629, "learning_rate": 4.555808656036447e-05, "loss": 3.8962, "step": 6400 }, { "epoch": 0.27764725983512023, "grad_norm": 12.139480590820312, "learning_rate": 4.626993166287016e-05, "loss": 3.722, "step": 6500 }, { "epoch": 0.28191875614027595, "grad_norm": 8.343817710876465, "learning_rate": 4.698177676537586e-05, "loss": 3.8009, "step": 6600 }, { "epoch": 0.2861902524454316, "grad_norm": 17.52387809753418, "learning_rate": 4.769362186788155e-05, "loss": 3.8687, "step": 6700 }, { "epoch": 0.29046174875058733, "grad_norm": 7.540428638458252, "learning_rate": 4.8405466970387246e-05, "loss": 3.727, "step": 6800 }, { "epoch": 0.29473324505574305, "grad_norm": 11.794758796691895, "learning_rate": 4.911731207289294e-05, "loss": 3.7971, "step": 6900 }, { "epoch": 0.2990047413608987, "grad_norm": 10.799798011779785, "learning_rate": 4.982915717539863e-05, "loss": 3.8319, "step": 7000 }, { "epoch": 0.2990047413608987, "eval_runtime": 404.96, "eval_samples_per_second": 115.621, "eval_steps_per_second": 14.453, "step": 7000 }, { "epoch": 0.30327623766605444, "grad_norm": 10.189812660217285, "learning_rate": 4.993988197883213e-05, "loss": 3.8696, "step": 7100 }, { "epoch": 0.3075477339712101, "grad_norm": 9.960589408874512, "learning_rate": 4.986077931940072e-05, "loss": 3.7356, "step": 7200 }, { "epoch": 0.3118192302763658, "grad_norm": 19.682788848876953, "learning_rate": 4.978167665996931e-05, "loss": 3.8524, "step": 7300 }, { "epoch": 0.3160907265815215, "grad_norm": 10.302993774414062, "learning_rate": 4.9702574000537896e-05, "loss": 3.6824, "step": 7400 }, { "epoch": 0.3203622228866772, "grad_norm": 11.691702842712402, "learning_rate": 4.9623471341106494e-05, "loss": 3.8522, "step": 7500 }, { "epoch": 0.3246337191918329, "grad_norm": 8.01658821105957, "learning_rate": 4.954436868167508e-05, "loss": 3.8386, "step": 7600 }, { "epoch": 0.3289052154969886, "grad_norm": 8.131669998168945, "learning_rate": 4.946526602224367e-05, "loss": 3.7587, "step": 7700 }, { "epoch": 0.3331767118021443, "grad_norm": 63.0767822265625, "learning_rate": 4.938616336281226e-05, "loss": 3.9019, "step": 7800 }, { "epoch": 0.3374482081073, "grad_norm": 10.04031753540039, "learning_rate": 4.930706070338085e-05, "loss": 3.8661, "step": 7900 }, { "epoch": 0.3417197044124557, "grad_norm": 8.829032897949219, "learning_rate": 4.922795804394944e-05, "loss": 3.8105, "step": 8000 }, { "epoch": 0.3417197044124557, "eval_runtime": 405.1664, "eval_samples_per_second": 115.562, "eval_steps_per_second": 14.446, "step": 8000 }, { "epoch": 0.34599120071761136, "grad_norm": 15.314312934875488, "learning_rate": 4.914885538451803e-05, "loss": 3.6279, "step": 8100 }, { "epoch": 0.3502626970227671, "grad_norm": 8.690494537353516, "learning_rate": 4.906975272508662e-05, "loss": 3.8336, "step": 8200 }, { "epoch": 0.3545341933279228, "grad_norm": 10.526517868041992, "learning_rate": 4.8990650065655206e-05, "loss": 3.8046, "step": 8300 }, { "epoch": 0.35880568963307846, "grad_norm": 7.85405969619751, "learning_rate": 4.8911547406223804e-05, "loss": 3.7721, "step": 8400 }, { "epoch": 0.3630771859382342, "grad_norm": 11.473519325256348, "learning_rate": 4.883244474679239e-05, "loss": 3.7854, "step": 8500 }, { "epoch": 0.36734868224338985, "grad_norm": 9.746980667114258, "learning_rate": 4.875334208736098e-05, "loss": 3.7573, "step": 8600 }, { "epoch": 0.37162017854854557, "grad_norm": 9.924898147583008, "learning_rate": 4.867423942792957e-05, "loss": 3.8054, "step": 8700 }, { "epoch": 0.37589167485370123, "grad_norm": 8.137608528137207, "learning_rate": 4.859513676849816e-05, "loss": 3.8245, "step": 8800 }, { "epoch": 0.38016317115885695, "grad_norm": 8.987218856811523, "learning_rate": 4.851603410906675e-05, "loss": 3.742, "step": 8900 }, { "epoch": 0.38443466746401267, "grad_norm": 9.04791259765625, "learning_rate": 4.843693144963534e-05, "loss": 3.7748, "step": 9000 }, { "epoch": 0.38443466746401267, "eval_runtime": 404.3649, "eval_samples_per_second": 115.791, "eval_steps_per_second": 14.475, "step": 9000 }, { "epoch": 0.38870616376916833, "grad_norm": 10.426161766052246, "learning_rate": 4.835782879020393e-05, "loss": 3.823, "step": 9100 }, { "epoch": 0.39297766007432405, "grad_norm": 8.951033592224121, "learning_rate": 4.8278726130772516e-05, "loss": 3.5732, "step": 9200 }, { "epoch": 0.3972491563794797, "grad_norm": 10.766894340515137, "learning_rate": 4.819962347134111e-05, "loss": 3.7286, "step": 9300 }, { "epoch": 0.40152065268463544, "grad_norm": 9.429593086242676, "learning_rate": 4.81205208119097e-05, "loss": 3.7493, "step": 9400 }, { "epoch": 0.4057921489897911, "grad_norm": 14.518060684204102, "learning_rate": 4.804141815247829e-05, "loss": 3.81, "step": 9500 }, { "epoch": 0.4100636452949468, "grad_norm": 20.795759201049805, "learning_rate": 4.7962315493046875e-05, "loss": 3.7045, "step": 9600 }, { "epoch": 0.41433514160010254, "grad_norm": 10.095163345336914, "learning_rate": 4.788321283361547e-05, "loss": 3.7767, "step": 9700 }, { "epoch": 0.4186066379052582, "grad_norm": 14.195402145385742, "learning_rate": 4.780411017418406e-05, "loss": 3.6874, "step": 9800 }, { "epoch": 0.4228781342104139, "grad_norm": 8.357501029968262, "learning_rate": 4.772500751475265e-05, "loss": 3.6675, "step": 9900 }, { "epoch": 0.4271496305155696, "grad_norm": 8.715255737304688, "learning_rate": 4.764590485532124e-05, "loss": 3.7657, "step": 10000 }, { "epoch": 0.4271496305155696, "eval_runtime": 405.002, "eval_samples_per_second": 115.609, "eval_steps_per_second": 14.452, "step": 10000 }, { "epoch": 0.4314211268207253, "grad_norm": 11.821653366088867, "learning_rate": 4.7566802195889826e-05, "loss": 3.6386, "step": 10100 }, { "epoch": 0.435692623125881, "grad_norm": 10.1320161819458, "learning_rate": 4.748769953645842e-05, "loss": 3.808, "step": 10200 }, { "epoch": 0.4399641194310367, "grad_norm": 9.089192390441895, "learning_rate": 4.740859687702701e-05, "loss": 3.7376, "step": 10300 }, { "epoch": 0.4442356157361924, "grad_norm": 9.963017463684082, "learning_rate": 4.73294942175956e-05, "loss": 3.8257, "step": 10400 }, { "epoch": 0.4485071120413481, "grad_norm": 9.806687355041504, "learning_rate": 4.7250391558164185e-05, "loss": 3.7903, "step": 10500 }, { "epoch": 0.4527786083465038, "grad_norm": 10.770492553710938, "learning_rate": 4.7171288898732776e-05, "loss": 3.7205, "step": 10600 }, { "epoch": 0.45705010465165946, "grad_norm": 11.635176658630371, "learning_rate": 4.709218623930137e-05, "loss": 3.6938, "step": 10700 }, { "epoch": 0.4613216009568152, "grad_norm": 8.8875732421875, "learning_rate": 4.701308357986996e-05, "loss": 3.7123, "step": 10800 }, { "epoch": 0.46559309726197085, "grad_norm": 7.193441867828369, "learning_rate": 4.6933980920438544e-05, "loss": 3.6563, "step": 10900 }, { "epoch": 0.46986459356712656, "grad_norm": 9.051284790039062, "learning_rate": 4.6854878261007135e-05, "loss": 3.7284, "step": 11000 }, { "epoch": 0.46986459356712656, "eval_runtime": 405.9538, "eval_samples_per_second": 115.338, "eval_steps_per_second": 14.418, "step": 11000 }, { "epoch": 0.4741360898722823, "grad_norm": 12.352762222290039, "learning_rate": 4.677577560157573e-05, "loss": 3.789, "step": 11100 }, { "epoch": 0.47840758617743795, "grad_norm": 9.574907302856445, "learning_rate": 4.669667294214432e-05, "loss": 3.7442, "step": 11200 }, { "epoch": 0.48267908248259367, "grad_norm": 9.639921188354492, "learning_rate": 4.661757028271291e-05, "loss": 3.7511, "step": 11300 }, { "epoch": 0.48695057878774933, "grad_norm": 10.295048713684082, "learning_rate": 4.6538467623281494e-05, "loss": 3.7032, "step": 11400 }, { "epoch": 0.49122207509290505, "grad_norm": 8.52436351776123, "learning_rate": 4.6459364963850086e-05, "loss": 3.7266, "step": 11500 }, { "epoch": 0.4954935713980607, "grad_norm": 12.574061393737793, "learning_rate": 4.638026230441868e-05, "loss": 3.7196, "step": 11600 }, { "epoch": 0.49976506770321644, "grad_norm": 12.421334266662598, "learning_rate": 4.630115964498727e-05, "loss": 3.6734, "step": 11700 }, { "epoch": 0.5040365640083722, "grad_norm": 14.321782112121582, "learning_rate": 4.6222056985555854e-05, "loss": 3.8048, "step": 11800 }, { "epoch": 0.5083080603135278, "grad_norm": 12.409293174743652, "learning_rate": 4.614295432612445e-05, "loss": 3.7366, "step": 11900 }, { "epoch": 0.5125795566186835, "grad_norm": 9.002853393554688, "learning_rate": 4.6063851666693037e-05, "loss": 3.7532, "step": 12000 }, { "epoch": 0.5125795566186835, "eval_runtime": 404.8624, "eval_samples_per_second": 115.649, "eval_steps_per_second": 14.457, "step": 12000 }, { "epoch": 0.5168510529238393, "grad_norm": 8.214455604553223, "learning_rate": 4.598474900726162e-05, "loss": 3.7221, "step": 12100 }, { "epoch": 0.5211225492289949, "grad_norm": 7.018616199493408, "learning_rate": 4.590564634783022e-05, "loss": 3.6718, "step": 12200 }, { "epoch": 0.5253940455341506, "grad_norm": 12.610575675964355, "learning_rate": 4.5826543688398804e-05, "loss": 3.7282, "step": 12300 }, { "epoch": 0.5296655418393064, "grad_norm": 10.168871879577637, "learning_rate": 4.5747441028967396e-05, "loss": 3.7455, "step": 12400 }, { "epoch": 0.533937038144462, "grad_norm": 9.421287536621094, "learning_rate": 4.566833836953599e-05, "loss": 3.8177, "step": 12500 }, { "epoch": 0.5382085344496177, "grad_norm": 11.314359664916992, "learning_rate": 4.558923571010458e-05, "loss": 3.7797, "step": 12600 }, { "epoch": 0.5424800307547734, "grad_norm": 10.510274887084961, "learning_rate": 4.5510133050673163e-05, "loss": 3.7639, "step": 12700 }, { "epoch": 0.5467515270599291, "grad_norm": 14.740921020507812, "learning_rate": 4.5431030391241755e-05, "loss": 3.8299, "step": 12800 }, { "epoch": 0.5510230233650848, "grad_norm": 7.322781562805176, "learning_rate": 4.5351927731810346e-05, "loss": 3.8357, "step": 12900 }, { "epoch": 0.5552945196702405, "grad_norm": 10.399321556091309, "learning_rate": 4.527282507237894e-05, "loss": 3.6613, "step": 13000 }, { "epoch": 0.5552945196702405, "eval_runtime": 404.2488, "eval_samples_per_second": 115.825, "eval_steps_per_second": 14.479, "step": 13000 }, { "epoch": 0.5595660159753962, "grad_norm": 8.805388450622559, "learning_rate": 4.519372241294752e-05, "loss": 3.7299, "step": 13100 }, { "epoch": 0.5638375122805519, "grad_norm": 7.358932018280029, "learning_rate": 4.5114619753516114e-05, "loss": 3.7956, "step": 13200 }, { "epoch": 0.5681090085857076, "grad_norm": 8.338736534118652, "learning_rate": 4.5035517094084706e-05, "loss": 3.8274, "step": 13300 }, { "epoch": 0.5723805048908632, "grad_norm": 9.217434883117676, "learning_rate": 4.495641443465329e-05, "loss": 3.8071, "step": 13400 }, { "epoch": 0.576652001196019, "grad_norm": 8.726869583129883, "learning_rate": 4.487731177522189e-05, "loss": 3.7296, "step": 13500 }, { "epoch": 0.5809234975011747, "grad_norm": 11.813636779785156, "learning_rate": 4.479820911579047e-05, "loss": 3.8608, "step": 13600 }, { "epoch": 0.5851949938063303, "grad_norm": 11.595725059509277, "learning_rate": 4.4719106456359065e-05, "loss": 3.7096, "step": 13700 }, { "epoch": 0.5894664901114861, "grad_norm": 9.259355545043945, "learning_rate": 4.4640003796927656e-05, "loss": 3.6732, "step": 13800 }, { "epoch": 0.5937379864166418, "grad_norm": 13.34984016418457, "learning_rate": 4.456090113749625e-05, "loss": 3.8131, "step": 13900 }, { "epoch": 0.5980094827217974, "grad_norm": 10.516456604003906, "learning_rate": 4.448179847806483e-05, "loss": 3.7439, "step": 14000 }, { "epoch": 0.5980094827217974, "eval_runtime": 404.1711, "eval_samples_per_second": 115.847, "eval_steps_per_second": 14.481, "step": 14000 }, { "epoch": 0.6022809790269531, "grad_norm": 12.842930793762207, "learning_rate": 4.4402695818633424e-05, "loss": 3.7682, "step": 14100 }, { "epoch": 0.6065524753321089, "grad_norm": 19.421875, "learning_rate": 4.4323593159202015e-05, "loss": 3.663, "step": 14200 }, { "epoch": 0.6108239716372645, "grad_norm": 7.454352855682373, "learning_rate": 4.42444904997706e-05, "loss": 3.7463, "step": 14300 }, { "epoch": 0.6150954679424202, "grad_norm": 13.48552131652832, "learning_rate": 4.41653878403392e-05, "loss": 3.649, "step": 14400 }, { "epoch": 0.619366964247576, "grad_norm": 11.968147277832031, "learning_rate": 4.408628518090778e-05, "loss": 3.7516, "step": 14500 }, { "epoch": 0.6236384605527316, "grad_norm": 12.150687217712402, "learning_rate": 4.4007182521476375e-05, "loss": 3.7322, "step": 14600 }, { "epoch": 0.6279099568578873, "grad_norm": 8.789100646972656, "learning_rate": 4.3928079862044966e-05, "loss": 3.6886, "step": 14700 }, { "epoch": 0.632181453163043, "grad_norm": 10.21249008178711, "learning_rate": 4.384897720261356e-05, "loss": 3.6862, "step": 14800 }, { "epoch": 0.6364529494681987, "grad_norm": 9.880081176757812, "learning_rate": 4.376987454318214e-05, "loss": 3.6766, "step": 14900 }, { "epoch": 0.6407244457733544, "grad_norm": 11.831520080566406, "learning_rate": 4.3690771883750734e-05, "loss": 3.645, "step": 15000 }, { "epoch": 0.6407244457733544, "eval_runtime": 404.5696, "eval_samples_per_second": 115.733, "eval_steps_per_second": 14.467, "step": 15000 }, { "epoch": 0.6449959420785101, "grad_norm": 8.91051959991455, "learning_rate": 4.3611669224319325e-05, "loss": 3.6847, "step": 15100 }, { "epoch": 0.6492674383836659, "grad_norm": 9.260310173034668, "learning_rate": 4.353256656488791e-05, "loss": 3.7197, "step": 15200 }, { "epoch": 0.6535389346888215, "grad_norm": 10.138089179992676, "learning_rate": 4.34534639054565e-05, "loss": 3.6529, "step": 15300 }, { "epoch": 0.6578104309939772, "grad_norm": 8.813399314880371, "learning_rate": 4.337436124602509e-05, "loss": 3.6541, "step": 15400 }, { "epoch": 0.6620819272991328, "grad_norm": 9.144048690795898, "learning_rate": 4.3295258586593684e-05, "loss": 3.5101, "step": 15500 }, { "epoch": 0.6663534236042886, "grad_norm": 8.948995590209961, "learning_rate": 4.321615592716227e-05, "loss": 3.7848, "step": 15600 }, { "epoch": 0.6706249199094443, "grad_norm": 9.42180061340332, "learning_rate": 4.313705326773087e-05, "loss": 3.5926, "step": 15700 }, { "epoch": 0.6748964162146, "grad_norm": 8.974250793457031, "learning_rate": 4.305795060829945e-05, "loss": 3.6967, "step": 15800 }, { "epoch": 0.6791679125197557, "grad_norm": 12.110358238220215, "learning_rate": 4.2978847948868044e-05, "loss": 3.6521, "step": 15900 }, { "epoch": 0.6834394088249114, "grad_norm": 9.907513618469238, "learning_rate": 4.2899745289436635e-05, "loss": 3.703, "step": 16000 }, { "epoch": 0.6834394088249114, "eval_runtime": 404.4023, "eval_samples_per_second": 115.781, "eval_steps_per_second": 14.473, "step": 16000 }, { "epoch": 0.687710905130067, "grad_norm": 8.421221733093262, "learning_rate": 4.2820642630005226e-05, "loss": 3.5446, "step": 16100 }, { "epoch": 0.6919824014352227, "grad_norm": 8.890350341796875, "learning_rate": 4.274153997057381e-05, "loss": 3.7051, "step": 16200 }, { "epoch": 0.6962538977403785, "grad_norm": 7.985939979553223, "learning_rate": 4.26624373111424e-05, "loss": 3.7894, "step": 16300 }, { "epoch": 0.7005253940455342, "grad_norm": 9.559375762939453, "learning_rate": 4.2583334651710994e-05, "loss": 3.7158, "step": 16400 }, { "epoch": 0.7047968903506898, "grad_norm": 9.920624732971191, "learning_rate": 4.250423199227958e-05, "loss": 3.7286, "step": 16500 }, { "epoch": 0.7090683866558456, "grad_norm": 13.251816749572754, "learning_rate": 4.242512933284817e-05, "loss": 3.7271, "step": 16600 }, { "epoch": 0.7133398829610013, "grad_norm": 11.248520851135254, "learning_rate": 4.234602667341676e-05, "loss": 3.6633, "step": 16700 }, { "epoch": 0.7176113792661569, "grad_norm": 7.556328296661377, "learning_rate": 4.226692401398535e-05, "loss": 3.6706, "step": 16800 }, { "epoch": 0.7218828755713126, "grad_norm": 6.298122406005859, "learning_rate": 4.218782135455394e-05, "loss": 3.7307, "step": 16900 }, { "epoch": 0.7261543718764684, "grad_norm": 10.17672061920166, "learning_rate": 4.2108718695122536e-05, "loss": 3.7274, "step": 17000 }, { "epoch": 0.7261543718764684, "eval_runtime": 403.9594, "eval_samples_per_second": 115.908, "eval_steps_per_second": 14.489, "step": 17000 }, { "epoch": 0.730425868181624, "grad_norm": 8.036096572875977, "learning_rate": 4.202961603569112e-05, "loss": 3.641, "step": 17100 }, { "epoch": 0.7346973644867797, "grad_norm": 8.982136726379395, "learning_rate": 4.195051337625971e-05, "loss": 3.7079, "step": 17200 }, { "epoch": 0.7389688607919355, "grad_norm": 11.923723220825195, "learning_rate": 4.1871410716828304e-05, "loss": 3.7252, "step": 17300 }, { "epoch": 0.7432403570970911, "grad_norm": 10.336372375488281, "learning_rate": 4.179230805739689e-05, "loss": 3.6793, "step": 17400 }, { "epoch": 0.7475118534022468, "grad_norm": 10.367877960205078, "learning_rate": 4.171320539796548e-05, "loss": 3.6833, "step": 17500 }, { "epoch": 0.7517833497074025, "grad_norm": 8.473801612854004, "learning_rate": 4.163410273853407e-05, "loss": 3.6678, "step": 17600 }, { "epoch": 0.7560548460125582, "grad_norm": 7.864530563354492, "learning_rate": 4.155500007910266e-05, "loss": 3.6848, "step": 17700 }, { "epoch": 0.7603263423177139, "grad_norm": 10.16886043548584, "learning_rate": 4.147589741967125e-05, "loss": 3.8073, "step": 17800 }, { "epoch": 0.7645978386228696, "grad_norm": 10.161076545715332, "learning_rate": 4.1396794760239846e-05, "loss": 3.7171, "step": 17900 }, { "epoch": 0.7688693349280253, "grad_norm": 9.742669105529785, "learning_rate": 4.131769210080843e-05, "loss": 3.6089, "step": 18000 }, { "epoch": 0.7688693349280253, "eval_runtime": 403.6413, "eval_samples_per_second": 115.999, "eval_steps_per_second": 14.5, "step": 18000 }, { "epoch": 0.773140831233181, "grad_norm": 7.834203243255615, "learning_rate": 4.123858944137702e-05, "loss": 3.7618, "step": 18100 }, { "epoch": 0.7774123275383367, "grad_norm": 8.670865058898926, "learning_rate": 4.1159486781945614e-05, "loss": 3.5937, "step": 18200 }, { "epoch": 0.7816838238434923, "grad_norm": 10.17273998260498, "learning_rate": 4.10803841225142e-05, "loss": 3.6682, "step": 18300 }, { "epoch": 0.7859553201486481, "grad_norm": 7.384734630584717, "learning_rate": 4.100128146308279e-05, "loss": 3.7461, "step": 18400 }, { "epoch": 0.7902268164538038, "grad_norm": 9.957521438598633, "learning_rate": 4.092217880365138e-05, "loss": 3.646, "step": 18500 }, { "epoch": 0.7944983127589594, "grad_norm": 9.32741928100586, "learning_rate": 4.084307614421997e-05, "loss": 3.7085, "step": 18600 }, { "epoch": 0.7987698090641152, "grad_norm": 8.64340591430664, "learning_rate": 4.076397348478856e-05, "loss": 3.7954, "step": 18700 }, { "epoch": 0.8030413053692709, "grad_norm": 8.776473999023438, "learning_rate": 4.068487082535715e-05, "loss": 3.7171, "step": 18800 }, { "epoch": 0.8073128016744265, "grad_norm": 13.726973533630371, "learning_rate": 4.060576816592574e-05, "loss": 3.7593, "step": 18900 }, { "epoch": 0.8115842979795822, "grad_norm": 8.291767120361328, "learning_rate": 4.052666550649433e-05, "loss": 3.6206, "step": 19000 }, { "epoch": 0.8115842979795822, "eval_runtime": 404.755, "eval_samples_per_second": 115.68, "eval_steps_per_second": 14.461, "step": 19000 }, { "epoch": 0.815855794284738, "grad_norm": 9.087343215942383, "learning_rate": 4.044756284706292e-05, "loss": 3.6748, "step": 19100 }, { "epoch": 0.8201272905898936, "grad_norm": 7.206308364868164, "learning_rate": 4.0368460187631515e-05, "loss": 3.6976, "step": 19200 }, { "epoch": 0.8243987868950493, "grad_norm": 13.21072006225586, "learning_rate": 4.02893575282001e-05, "loss": 3.7547, "step": 19300 }, { "epoch": 0.8286702832002051, "grad_norm": 11.711308479309082, "learning_rate": 4.0210254868768685e-05, "loss": 3.7046, "step": 19400 }, { "epoch": 0.8329417795053607, "grad_norm": 7.493105411529541, "learning_rate": 4.013115220933728e-05, "loss": 3.6605, "step": 19500 }, { "epoch": 0.8372132758105164, "grad_norm": 10.920802116394043, "learning_rate": 4.005204954990587e-05, "loss": 3.701, "step": 19600 }, { "epoch": 0.8414847721156721, "grad_norm": 8.588319778442383, "learning_rate": 3.997294689047446e-05, "loss": 3.7119, "step": 19700 }, { "epoch": 0.8457562684208279, "grad_norm": 9.688274383544922, "learning_rate": 3.989384423104305e-05, "loss": 3.6968, "step": 19800 }, { "epoch": 0.8500277647259835, "grad_norm": 13.46649169921875, "learning_rate": 3.981474157161164e-05, "loss": 3.6608, "step": 19900 }, { "epoch": 0.8542992610311392, "grad_norm": 9.020798683166504, "learning_rate": 3.973563891218023e-05, "loss": 3.7014, "step": 20000 }, { "epoch": 0.8542992610311392, "eval_runtime": 404.5707, "eval_samples_per_second": 115.733, "eval_steps_per_second": 14.467, "step": 20000 }, { "epoch": 0.858570757336295, "grad_norm": 7.667457103729248, "learning_rate": 3.9656536252748825e-05, "loss": 3.6261, "step": 20100 }, { "epoch": 0.8628422536414506, "grad_norm": 10.752103805541992, "learning_rate": 3.957743359331741e-05, "loss": 3.7678, "step": 20200 }, { "epoch": 0.8671137499466063, "grad_norm": 8.758957862854004, "learning_rate": 3.9498330933885994e-05, "loss": 3.8176, "step": 20300 }, { "epoch": 0.871385246251762, "grad_norm": 9.372211456298828, "learning_rate": 3.941922827445459e-05, "loss": 3.6383, "step": 20400 }, { "epoch": 0.8756567425569177, "grad_norm": 10.67364501953125, "learning_rate": 3.934012561502318e-05, "loss": 3.7067, "step": 20500 }, { "epoch": 0.8799282388620734, "grad_norm": 12.151751518249512, "learning_rate": 3.926102295559177e-05, "loss": 3.7826, "step": 20600 }, { "epoch": 0.884199735167229, "grad_norm": 7.820495128631592, "learning_rate": 3.918192029616036e-05, "loss": 3.6867, "step": 20700 }, { "epoch": 0.8884712314723848, "grad_norm": 9.453180313110352, "learning_rate": 3.910281763672895e-05, "loss": 3.7301, "step": 20800 }, { "epoch": 0.8927427277775405, "grad_norm": 11.202925682067871, "learning_rate": 3.9023714977297536e-05, "loss": 3.6845, "step": 20900 }, { "epoch": 0.8970142240826962, "grad_norm": 13.49270248413086, "learning_rate": 3.894461231786613e-05, "loss": 3.7193, "step": 21000 }, { "epoch": 0.8970142240826962, "eval_runtime": 403.5135, "eval_samples_per_second": 116.036, "eval_steps_per_second": 14.505, "step": 21000 }, { "epoch": 0.9012857203878518, "grad_norm": 8.086437225341797, "learning_rate": 3.886550965843472e-05, "loss": 3.6406, "step": 21100 }, { "epoch": 0.9055572166930076, "grad_norm": 10.620895385742188, "learning_rate": 3.878640699900331e-05, "loss": 3.762, "step": 21200 }, { "epoch": 0.9098287129981633, "grad_norm": 6.320925712585449, "learning_rate": 3.8707304339571896e-05, "loss": 3.7283, "step": 21300 }, { "epoch": 0.9141002093033189, "grad_norm": 8.072772026062012, "learning_rate": 3.862820168014049e-05, "loss": 3.6657, "step": 21400 }, { "epoch": 0.9183717056084747, "grad_norm": 8.310846328735352, "learning_rate": 3.854909902070908e-05, "loss": 3.7271, "step": 21500 }, { "epoch": 0.9226432019136304, "grad_norm": 6.958920478820801, "learning_rate": 3.846999636127766e-05, "loss": 3.8027, "step": 21600 }, { "epoch": 0.926914698218786, "grad_norm": 10.530051231384277, "learning_rate": 3.839089370184626e-05, "loss": 3.6227, "step": 21700 }, { "epoch": 0.9311861945239417, "grad_norm": 9.503134727478027, "learning_rate": 3.8311791042414846e-05, "loss": 3.7262, "step": 21800 }, { "epoch": 0.9354576908290975, "grad_norm": 8.891386985778809, "learning_rate": 3.823268838298344e-05, "loss": 3.6882, "step": 21900 }, { "epoch": 0.9397291871342531, "grad_norm": 9.793424606323242, "learning_rate": 3.815358572355203e-05, "loss": 3.7304, "step": 22000 }, { "epoch": 0.9397291871342531, "eval_runtime": 404.4356, "eval_samples_per_second": 115.771, "eval_steps_per_second": 14.472, "step": 22000 }, { "epoch": 0.9440006834394088, "grad_norm": 8.443710327148438, "learning_rate": 3.807448306412062e-05, "loss": 3.6348, "step": 22100 }, { "epoch": 0.9482721797445646, "grad_norm": 8.634336471557617, "learning_rate": 3.7995380404689205e-05, "loss": 3.6732, "step": 22200 }, { "epoch": 0.9525436760497202, "grad_norm": 9.720598220825195, "learning_rate": 3.79162777452578e-05, "loss": 3.7, "step": 22300 }, { "epoch": 0.9568151723548759, "grad_norm": 9.388401985168457, "learning_rate": 3.783717508582639e-05, "loss": 3.646, "step": 22400 }, { "epoch": 0.9610866686600316, "grad_norm": 8.947492599487305, "learning_rate": 3.775807242639497e-05, "loss": 3.5409, "step": 22500 }, { "epoch": 0.9653581649651873, "grad_norm": 9.596850395202637, "learning_rate": 3.7678969766963565e-05, "loss": 3.6636, "step": 22600 }, { "epoch": 0.969629661270343, "grad_norm": 12.37696361541748, "learning_rate": 3.7599867107532156e-05, "loss": 3.7021, "step": 22700 }, { "epoch": 0.9739011575754987, "grad_norm": 8.767573356628418, "learning_rate": 3.752076444810075e-05, "loss": 3.6152, "step": 22800 }, { "epoch": 0.9781726538806544, "grad_norm": 8.559804916381836, "learning_rate": 3.744166178866933e-05, "loss": 3.6866, "step": 22900 }, { "epoch": 0.9824441501858101, "grad_norm": 7.025428771972656, "learning_rate": 3.736255912923793e-05, "loss": 3.7933, "step": 23000 }, { "epoch": 0.9824441501858101, "eval_runtime": 403.8391, "eval_samples_per_second": 115.942, "eval_steps_per_second": 14.493, "step": 23000 }, { "epoch": 0.9867156464909658, "grad_norm": 8.558939933776855, "learning_rate": 3.7283456469806515e-05, "loss": 3.6242, "step": 23100 }, { "epoch": 0.9909871427961214, "grad_norm": 7.838054656982422, "learning_rate": 3.720435381037511e-05, "loss": 3.7873, "step": 23200 }, { "epoch": 0.9952586391012772, "grad_norm": 9.238251686096191, "learning_rate": 3.71252511509437e-05, "loss": 3.7437, "step": 23300 }, { "epoch": 0.9995301354064329, "grad_norm": 8.105572700500488, "learning_rate": 3.704614849151229e-05, "loss": 3.5954, "step": 23400 }, { "epoch": 1.0038016317115885, "grad_norm": 8.4044771194458, "learning_rate": 3.6967045832080874e-05, "loss": 3.6959, "step": 23500 }, { "epoch": 1.0080731280167443, "grad_norm": 7.410630702972412, "learning_rate": 3.6887943172649466e-05, "loss": 3.5738, "step": 23600 }, { "epoch": 1.0123446243218999, "grad_norm": 13.264152526855469, "learning_rate": 3.680884051321806e-05, "loss": 3.7171, "step": 23700 }, { "epoch": 1.0166161206270556, "grad_norm": 10.43344783782959, "learning_rate": 3.672973785378664e-05, "loss": 3.7067, "step": 23800 }, { "epoch": 1.0208876169322114, "grad_norm": 10.395238876342773, "learning_rate": 3.665063519435524e-05, "loss": 3.7069, "step": 23900 }, { "epoch": 1.025159113237367, "grad_norm": 9.611321449279785, "learning_rate": 3.6571532534923825e-05, "loss": 3.6583, "step": 24000 }, { "epoch": 1.025159113237367, "eval_runtime": 403.723, "eval_samples_per_second": 115.976, "eval_steps_per_second": 14.498, "step": 24000 }, { "epoch": 1.0294306095425227, "grad_norm": 7.200385570526123, "learning_rate": 3.6492429875492417e-05, "loss": 3.6306, "step": 24100 }, { "epoch": 1.0337021058476785, "grad_norm": 9.20836067199707, "learning_rate": 3.641332721606101e-05, "loss": 3.6762, "step": 24200 }, { "epoch": 1.037973602152834, "grad_norm": 7.563958644866943, "learning_rate": 3.63342245566296e-05, "loss": 3.7167, "step": 24300 }, { "epoch": 1.0422450984579898, "grad_norm": 13.854744911193848, "learning_rate": 3.6255121897198184e-05, "loss": 3.618, "step": 24400 }, { "epoch": 1.0465165947631456, "grad_norm": 7.969038963317871, "learning_rate": 3.6176019237766776e-05, "loss": 3.722, "step": 24500 }, { "epoch": 1.0507880910683012, "grad_norm": 9.738038063049316, "learning_rate": 3.609691657833537e-05, "loss": 3.7552, "step": 24600 }, { "epoch": 1.055059587373457, "grad_norm": 10.093921661376953, "learning_rate": 3.601781391890395e-05, "loss": 3.7502, "step": 24700 }, { "epoch": 1.0593310836786127, "grad_norm": 6.876298427581787, "learning_rate": 3.593871125947254e-05, "loss": 3.6343, "step": 24800 }, { "epoch": 1.0636025799837683, "grad_norm": 7.968320846557617, "learning_rate": 3.5859608600041135e-05, "loss": 3.6845, "step": 24900 }, { "epoch": 1.067874076288924, "grad_norm": 9.148797988891602, "learning_rate": 3.5780505940609726e-05, "loss": 3.6106, "step": 25000 }, { "epoch": 1.067874076288924, "eval_runtime": 403.8237, "eval_samples_per_second": 115.947, "eval_steps_per_second": 14.494, "step": 25000 }, { "epoch": 1.0721455725940796, "grad_norm": 9.72523307800293, "learning_rate": 3.570140328117831e-05, "loss": 3.6919, "step": 25100 }, { "epoch": 1.0764170688992354, "grad_norm": 9.23186206817627, "learning_rate": 3.562230062174691e-05, "loss": 3.6378, "step": 25200 }, { "epoch": 1.0806885652043912, "grad_norm": 8.581460952758789, "learning_rate": 3.5543197962315494e-05, "loss": 3.7005, "step": 25300 }, { "epoch": 1.0849600615095467, "grad_norm": 9.83565902709961, "learning_rate": 3.5464095302884085e-05, "loss": 3.6315, "step": 25400 }, { "epoch": 1.0892315578147025, "grad_norm": 7.4981770515441895, "learning_rate": 3.538499264345268e-05, "loss": 3.703, "step": 25500 }, { "epoch": 1.0935030541198583, "grad_norm": 9.84447193145752, "learning_rate": 3.530588998402126e-05, "loss": 3.703, "step": 25600 }, { "epoch": 1.0977745504250138, "grad_norm": 11.77198600769043, "learning_rate": 3.522678732458985e-05, "loss": 3.74, "step": 25700 }, { "epoch": 1.1020460467301696, "grad_norm": 9.35525131225586, "learning_rate": 3.5147684665158445e-05, "loss": 3.6574, "step": 25800 }, { "epoch": 1.1063175430353254, "grad_norm": 11.326153755187988, "learning_rate": 3.5068582005727036e-05, "loss": 3.6052, "step": 25900 }, { "epoch": 1.110589039340481, "grad_norm": 8.957196235656738, "learning_rate": 3.498947934629562e-05, "loss": 3.6924, "step": 26000 }, { "epoch": 1.110589039340481, "eval_runtime": 404.7232, "eval_samples_per_second": 115.689, "eval_steps_per_second": 14.462, "step": 26000 }, { "epoch": 1.1148605356456367, "grad_norm": 8.46112060546875, "learning_rate": 3.491037668686421e-05, "loss": 3.5975, "step": 26100 }, { "epoch": 1.1191320319507922, "grad_norm": 10.088958740234375, "learning_rate": 3.4831274027432804e-05, "loss": 3.6885, "step": 26200 }, { "epoch": 1.123403528255948, "grad_norm": 8.147522926330566, "learning_rate": 3.4752171368001395e-05, "loss": 3.587, "step": 26300 }, { "epoch": 1.1276750245611038, "grad_norm": 9.306445121765137, "learning_rate": 3.467306870856999e-05, "loss": 3.6943, "step": 26400 }, { "epoch": 1.1319465208662594, "grad_norm": 7.206762790679932, "learning_rate": 3.459396604913858e-05, "loss": 3.8088, "step": 26500 }, { "epoch": 1.1362180171714151, "grad_norm": 7.733761787414551, "learning_rate": 3.451486338970716e-05, "loss": 3.6604, "step": 26600 }, { "epoch": 1.140489513476571, "grad_norm": 12.075209617614746, "learning_rate": 3.4435760730275754e-05, "loss": 3.6716, "step": 26700 }, { "epoch": 1.1447610097817265, "grad_norm": 13.957979202270508, "learning_rate": 3.4356658070844346e-05, "loss": 3.6605, "step": 26800 }, { "epoch": 1.1490325060868822, "grad_norm": 6.747539520263672, "learning_rate": 3.427755541141293e-05, "loss": 3.6524, "step": 26900 }, { "epoch": 1.153304002392038, "grad_norm": 7.960626602172852, "learning_rate": 3.419845275198152e-05, "loss": 3.6574, "step": 27000 }, { "epoch": 1.153304002392038, "eval_runtime": 403.5226, "eval_samples_per_second": 116.033, "eval_steps_per_second": 14.505, "step": 27000 }, { "epoch": 1.1575754986971936, "grad_norm": 6.446537971496582, "learning_rate": 3.4119350092550114e-05, "loss": 3.6968, "step": 27100 }, { "epoch": 1.1618469950023493, "grad_norm": 8.360943794250488, "learning_rate": 3.4040247433118705e-05, "loss": 3.7481, "step": 27200 }, { "epoch": 1.1661184913075051, "grad_norm": 9.145593643188477, "learning_rate": 3.396114477368729e-05, "loss": 3.7588, "step": 27300 }, { "epoch": 1.1703899876126607, "grad_norm": 11.084358215332031, "learning_rate": 3.388204211425589e-05, "loss": 3.6393, "step": 27400 }, { "epoch": 1.1746614839178164, "grad_norm": 11.370959281921387, "learning_rate": 3.380293945482447e-05, "loss": 3.6566, "step": 27500 }, { "epoch": 1.1789329802229722, "grad_norm": 9.31324291229248, "learning_rate": 3.372383679539306e-05, "loss": 3.5833, "step": 27600 }, { "epoch": 1.1832044765281278, "grad_norm": 10.302188873291016, "learning_rate": 3.3644734135961656e-05, "loss": 3.6591, "step": 27700 }, { "epoch": 1.1874759728332835, "grad_norm": 9.487174034118652, "learning_rate": 3.356563147653024e-05, "loss": 3.6245, "step": 27800 }, { "epoch": 1.191747469138439, "grad_norm": 8.596895217895508, "learning_rate": 3.348652881709883e-05, "loss": 3.7252, "step": 27900 }, { "epoch": 1.1960189654435949, "grad_norm": 8.368951797485352, "learning_rate": 3.3407426157667423e-05, "loss": 3.7371, "step": 28000 }, { "epoch": 1.1960189654435949, "eval_runtime": 404.648, "eval_samples_per_second": 115.71, "eval_steps_per_second": 14.464, "step": 28000 }, { "epoch": 1.2002904617487506, "grad_norm": 15.659690856933594, "learning_rate": 3.3328323498236015e-05, "loss": 3.5971, "step": 28100 }, { "epoch": 1.2045619580539062, "grad_norm": 7.678028106689453, "learning_rate": 3.32492208388046e-05, "loss": 3.722, "step": 28200 }, { "epoch": 1.208833454359062, "grad_norm": 7.527515888214111, "learning_rate": 3.317011817937319e-05, "loss": 3.6987, "step": 28300 }, { "epoch": 1.2131049506642178, "grad_norm": 7.842383861541748, "learning_rate": 3.309101551994178e-05, "loss": 3.6394, "step": 28400 }, { "epoch": 1.2173764469693733, "grad_norm": 7.929213523864746, "learning_rate": 3.3011912860510374e-05, "loss": 3.6538, "step": 28500 }, { "epoch": 1.221647943274529, "grad_norm": 10.908970832824707, "learning_rate": 3.293281020107896e-05, "loss": 3.8107, "step": 28600 }, { "epoch": 1.2259194395796849, "grad_norm": 13.46042251586914, "learning_rate": 3.285370754164755e-05, "loss": 3.7662, "step": 28700 }, { "epoch": 1.2301909358848404, "grad_norm": 9.317851066589355, "learning_rate": 3.277460488221614e-05, "loss": 3.7211, "step": 28800 }, { "epoch": 1.2344624321899962, "grad_norm": 8.503124237060547, "learning_rate": 3.2695502222784726e-05, "loss": 3.5346, "step": 28900 }, { "epoch": 1.2387339284951517, "grad_norm": 7.7580718994140625, "learning_rate": 3.2616399563353325e-05, "loss": 3.7839, "step": 29000 }, { "epoch": 1.2387339284951517, "eval_runtime": 404.9737, "eval_samples_per_second": 115.617, "eval_steps_per_second": 14.453, "step": 29000 }, { "epoch": 1.2430054248003075, "grad_norm": 6.63585090637207, "learning_rate": 3.253729690392191e-05, "loss": 3.7525, "step": 29100 }, { "epoch": 1.2472769211054633, "grad_norm": 8.438802719116211, "learning_rate": 3.24581942444905e-05, "loss": 3.6626, "step": 29200 }, { "epoch": 1.251548417410619, "grad_norm": 10.101741790771484, "learning_rate": 3.237909158505909e-05, "loss": 3.6074, "step": 29300 }, { "epoch": 1.2558199137157746, "grad_norm": 7.5071797370910645, "learning_rate": 3.2299988925627684e-05, "loss": 3.6081, "step": 29400 }, { "epoch": 1.2600914100209304, "grad_norm": 8.632312774658203, "learning_rate": 3.222088626619627e-05, "loss": 3.5992, "step": 29500 }, { "epoch": 1.264362906326086, "grad_norm": 8.449037551879883, "learning_rate": 3.214178360676487e-05, "loss": 3.5541, "step": 29600 }, { "epoch": 1.2686344026312417, "grad_norm": 7.799576759338379, "learning_rate": 3.206268094733345e-05, "loss": 3.6541, "step": 29700 }, { "epoch": 1.2729058989363975, "grad_norm": 11.673965454101562, "learning_rate": 3.1983578287902036e-05, "loss": 3.6498, "step": 29800 }, { "epoch": 1.277177395241553, "grad_norm": 8.359036445617676, "learning_rate": 3.1904475628470635e-05, "loss": 3.6494, "step": 29900 }, { "epoch": 1.2814488915467088, "grad_norm": 13.046087265014648, "learning_rate": 3.182537296903922e-05, "loss": 3.8596, "step": 30000 }, { "epoch": 1.2814488915467088, "eval_runtime": 404.5555, "eval_samples_per_second": 115.737, "eval_steps_per_second": 14.468, "step": 30000 }, { "epoch": 1.2857203878518644, "grad_norm": 10.439358711242676, "learning_rate": 3.174627030960781e-05, "loss": 3.7013, "step": 30100 }, { "epoch": 1.2899918841570202, "grad_norm": 7.784947395324707, "learning_rate": 3.16671676501764e-05, "loss": 3.7573, "step": 30200 }, { "epoch": 1.294263380462176, "grad_norm": 11.142706871032715, "learning_rate": 3.1588064990744994e-05, "loss": 3.6061, "step": 30300 }, { "epoch": 1.2985348767673317, "grad_norm": 8.045978546142578, "learning_rate": 3.150896233131358e-05, "loss": 3.6615, "step": 30400 }, { "epoch": 1.3028063730724873, "grad_norm": 8.409544944763184, "learning_rate": 3.142985967188217e-05, "loss": 3.6075, "step": 30500 }, { "epoch": 1.307077869377643, "grad_norm": 10.13918685913086, "learning_rate": 3.135075701245076e-05, "loss": 3.6318, "step": 30600 }, { "epoch": 1.3113493656827986, "grad_norm": 10.452644348144531, "learning_rate": 3.1271654353019346e-05, "loss": 3.6653, "step": 30700 }, { "epoch": 1.3156208619879544, "grad_norm": 8.69783878326416, "learning_rate": 3.119255169358794e-05, "loss": 3.5687, "step": 30800 }, { "epoch": 1.3198923582931101, "grad_norm": 9.234668731689453, "learning_rate": 3.111344903415653e-05, "loss": 3.6629, "step": 30900 }, { "epoch": 1.324163854598266, "grad_norm": 7.855345249176025, "learning_rate": 3.103434637472512e-05, "loss": 3.7406, "step": 31000 }, { "epoch": 1.324163854598266, "eval_runtime": 404.076, "eval_samples_per_second": 115.874, "eval_steps_per_second": 14.485, "step": 31000 }, { "epoch": 1.3284353509034215, "grad_norm": 13.292342185974121, "learning_rate": 3.0955243715293705e-05, "loss": 3.7277, "step": 31100 }, { "epoch": 1.3327068472085772, "grad_norm": 14.126769065856934, "learning_rate": 3.0876141055862304e-05, "loss": 3.8065, "step": 31200 }, { "epoch": 1.3369783435137328, "grad_norm": 7.635355472564697, "learning_rate": 3.079703839643089e-05, "loss": 3.6653, "step": 31300 }, { "epoch": 1.3412498398188886, "grad_norm": 9.28641128540039, "learning_rate": 3.071793573699948e-05, "loss": 3.7125, "step": 31400 }, { "epoch": 1.3455213361240443, "grad_norm": 8.960599899291992, "learning_rate": 3.063883307756807e-05, "loss": 3.639, "step": 31500 }, { "epoch": 1.3497928324292, "grad_norm": 10.085050582885742, "learning_rate": 3.055973041813666e-05, "loss": 3.7317, "step": 31600 }, { "epoch": 1.3540643287343557, "grad_norm": 8.527816772460938, "learning_rate": 3.048062775870525e-05, "loss": 3.6044, "step": 31700 }, { "epoch": 1.3583358250395112, "grad_norm": 11.678420066833496, "learning_rate": 3.0401525099273835e-05, "loss": 3.572, "step": 31800 }, { "epoch": 1.362607321344667, "grad_norm": 5.9545207023620605, "learning_rate": 3.032242243984243e-05, "loss": 3.7374, "step": 31900 }, { "epoch": 1.3668788176498228, "grad_norm": 8.175214767456055, "learning_rate": 3.024331978041102e-05, "loss": 3.624, "step": 32000 }, { "epoch": 1.3668788176498228, "eval_runtime": 403.7487, "eval_samples_per_second": 115.968, "eval_steps_per_second": 14.497, "step": 32000 }, { "epoch": 1.3711503139549786, "grad_norm": 7.345489978790283, "learning_rate": 3.016421712097961e-05, "loss": 3.6508, "step": 32100 }, { "epoch": 1.375421810260134, "grad_norm": 10.301737785339355, "learning_rate": 3.0085114461548198e-05, "loss": 3.5836, "step": 32200 }, { "epoch": 1.3796933065652899, "grad_norm": 8.771992683410645, "learning_rate": 3.000601180211679e-05, "loss": 3.7915, "step": 32300 }, { "epoch": 1.3839648028704454, "grad_norm": 9.168205261230469, "learning_rate": 2.9926909142685378e-05, "loss": 3.6517, "step": 32400 }, { "epoch": 1.3882362991756012, "grad_norm": 7.1654744148254395, "learning_rate": 2.984780648325397e-05, "loss": 3.503, "step": 32500 }, { "epoch": 1.392507795480757, "grad_norm": 8.390599250793457, "learning_rate": 2.9768703823822557e-05, "loss": 3.7276, "step": 32600 }, { "epoch": 1.3967792917859125, "grad_norm": 12.229002952575684, "learning_rate": 2.9689601164391152e-05, "loss": 3.7753, "step": 32700 }, { "epoch": 1.4010507880910683, "grad_norm": 11.649025917053223, "learning_rate": 2.9610498504959737e-05, "loss": 3.6765, "step": 32800 }, { "epoch": 1.4053222843962239, "grad_norm": 8.619730949401855, "learning_rate": 2.9531395845528325e-05, "loss": 3.6508, "step": 32900 }, { "epoch": 1.4095937807013796, "grad_norm": 9.323366165161133, "learning_rate": 2.945229318609692e-05, "loss": 3.7256, "step": 33000 }, { "epoch": 1.4095937807013796, "eval_runtime": 404.2192, "eval_samples_per_second": 115.833, "eval_steps_per_second": 14.48, "step": 33000 }, { "epoch": 1.4138652770065354, "grad_norm": 13.431550979614258, "learning_rate": 2.9373190526665504e-05, "loss": 3.7312, "step": 33100 }, { "epoch": 1.4181367733116912, "grad_norm": 7.221197128295898, "learning_rate": 2.92940878672341e-05, "loss": 3.5233, "step": 33200 }, { "epoch": 1.4224082696168467, "grad_norm": 8.221494674682617, "learning_rate": 2.9214985207802687e-05, "loss": 3.6815, "step": 33300 }, { "epoch": 1.4266797659220025, "grad_norm": 8.996779441833496, "learning_rate": 2.913588254837128e-05, "loss": 3.5838, "step": 33400 }, { "epoch": 1.430951262227158, "grad_norm": 7.899658679962158, "learning_rate": 2.9056779888939867e-05, "loss": 3.5632, "step": 33500 }, { "epoch": 1.4352227585323138, "grad_norm": 7.839208602905273, "learning_rate": 2.897767722950846e-05, "loss": 3.6154, "step": 33600 }, { "epoch": 1.4394942548374696, "grad_norm": 7.053780555725098, "learning_rate": 2.8898574570077047e-05, "loss": 3.6218, "step": 33700 }, { "epoch": 1.4437657511426254, "grad_norm": 11.56430721282959, "learning_rate": 2.8819471910645635e-05, "loss": 3.6301, "step": 33800 }, { "epoch": 1.448037247447781, "grad_norm": 12.948223114013672, "learning_rate": 2.8740369251214226e-05, "loss": 3.6278, "step": 33900 }, { "epoch": 1.4523087437529367, "grad_norm": 10.741262435913086, "learning_rate": 2.8661266591782814e-05, "loss": 3.7338, "step": 34000 }, { "epoch": 1.4523087437529367, "eval_runtime": 404.4425, "eval_samples_per_second": 115.769, "eval_steps_per_second": 14.472, "step": 34000 }, { "epoch": 1.4565802400580923, "grad_norm": 8.808819770812988, "learning_rate": 2.858216393235141e-05, "loss": 3.5755, "step": 34100 }, { "epoch": 1.460851736363248, "grad_norm": 7.9008097648620605, "learning_rate": 2.8503061272919994e-05, "loss": 3.5744, "step": 34200 }, { "epoch": 1.4651232326684038, "grad_norm": 10.011557579040527, "learning_rate": 2.842395861348859e-05, "loss": 3.7014, "step": 34300 }, { "epoch": 1.4693947289735594, "grad_norm": 8.058487892150879, "learning_rate": 2.8344855954057177e-05, "loss": 3.6554, "step": 34400 }, { "epoch": 1.4736662252787152, "grad_norm": 7.3602824211120605, "learning_rate": 2.8265753294625768e-05, "loss": 3.7166, "step": 34500 }, { "epoch": 1.4779377215838707, "grad_norm": 7.900210857391357, "learning_rate": 2.8186650635194356e-05, "loss": 3.6276, "step": 34600 }, { "epoch": 1.4822092178890265, "grad_norm": 7.839376926422119, "learning_rate": 2.8107547975762948e-05, "loss": 3.6036, "step": 34700 }, { "epoch": 1.4864807141941823, "grad_norm": 8.925679206848145, "learning_rate": 2.8028445316331536e-05, "loss": 3.6868, "step": 34800 }, { "epoch": 1.490752210499338, "grad_norm": 8.532880783081055, "learning_rate": 2.7949342656900124e-05, "loss": 3.7388, "step": 34900 }, { "epoch": 1.4950237068044936, "grad_norm": 9.397866249084473, "learning_rate": 2.7870239997468716e-05, "loss": 3.7206, "step": 35000 }, { "epoch": 1.4950237068044936, "eval_runtime": 404.715, "eval_samples_per_second": 115.691, "eval_steps_per_second": 14.462, "step": 35000 }, { "epoch": 1.4992952031096494, "grad_norm": 9.152342796325684, "learning_rate": 2.7791137338037304e-05, "loss": 3.6816, "step": 35100 }, { "epoch": 1.503566699414805, "grad_norm": 7.594329357147217, "learning_rate": 2.77120346786059e-05, "loss": 3.6713, "step": 35200 }, { "epoch": 1.5078381957199607, "grad_norm": 9.826537132263184, "learning_rate": 2.7632932019174483e-05, "loss": 3.6467, "step": 35300 }, { "epoch": 1.5121096920251165, "grad_norm": 9.374577522277832, "learning_rate": 2.7553829359743078e-05, "loss": 3.6588, "step": 35400 }, { "epoch": 1.5163811883302722, "grad_norm": 10.790063858032227, "learning_rate": 2.7474726700311666e-05, "loss": 3.5689, "step": 35500 }, { "epoch": 1.5206526846354278, "grad_norm": 10.145702362060547, "learning_rate": 2.7395624040880258e-05, "loss": 3.7457, "step": 35600 }, { "epoch": 1.5249241809405834, "grad_norm": 11.168187141418457, "learning_rate": 2.7316521381448846e-05, "loss": 3.5706, "step": 35700 }, { "epoch": 1.5291956772457391, "grad_norm": 9.234560012817383, "learning_rate": 2.7237418722017437e-05, "loss": 3.6744, "step": 35800 }, { "epoch": 1.533467173550895, "grad_norm": 10.015559196472168, "learning_rate": 2.7158316062586025e-05, "loss": 3.6374, "step": 35900 }, { "epoch": 1.5377386698560507, "grad_norm": 8.472687721252441, "learning_rate": 2.7079213403154613e-05, "loss": 3.6958, "step": 36000 }, { "epoch": 1.5377386698560507, "eval_runtime": 404.1589, "eval_samples_per_second": 115.85, "eval_steps_per_second": 14.482, "step": 36000 }, { "epoch": 1.5420101661612062, "grad_norm": 7.5909199714660645, "learning_rate": 2.7000110743723205e-05, "loss": 3.6096, "step": 36100 }, { "epoch": 1.546281662466362, "grad_norm": 18.598318099975586, "learning_rate": 2.6921008084291793e-05, "loss": 3.6368, "step": 36200 }, { "epoch": 1.5505531587715176, "grad_norm": 10.265989303588867, "learning_rate": 2.6841905424860388e-05, "loss": 3.6886, "step": 36300 }, { "epoch": 1.5548246550766733, "grad_norm": 16.7838077545166, "learning_rate": 2.6762802765428973e-05, "loss": 3.4999, "step": 36400 }, { "epoch": 1.559096151381829, "grad_norm": 9.542481422424316, "learning_rate": 2.6683700105997567e-05, "loss": 3.5173, "step": 36500 }, { "epoch": 1.5633676476869849, "grad_norm": 7.0144758224487305, "learning_rate": 2.6604597446566156e-05, "loss": 3.6004, "step": 36600 }, { "epoch": 1.5676391439921404, "grad_norm": 7.273271560668945, "learning_rate": 2.6525494787134747e-05, "loss": 3.5539, "step": 36700 }, { "epoch": 1.571910640297296, "grad_norm": 9.942744255065918, "learning_rate": 2.6446392127703335e-05, "loss": 3.6823, "step": 36800 }, { "epoch": 1.5761821366024518, "grad_norm": 8.686135292053223, "learning_rate": 2.6367289468271923e-05, "loss": 3.6757, "step": 36900 }, { "epoch": 1.5804536329076075, "grad_norm": 6.468233108520508, "learning_rate": 2.6288186808840515e-05, "loss": 3.6318, "step": 37000 }, { "epoch": 1.5804536329076075, "eval_runtime": 403.714, "eval_samples_per_second": 115.978, "eval_steps_per_second": 14.498, "step": 37000 }, { "epoch": 1.5847251292127633, "grad_norm": 8.390809059143066, "learning_rate": 2.6209084149409103e-05, "loss": 3.6048, "step": 37100 }, { "epoch": 1.588996625517919, "grad_norm": 11.824224472045898, "learning_rate": 2.6129981489977694e-05, "loss": 3.6128, "step": 37200 }, { "epoch": 1.5932681218230746, "grad_norm": 9.557259559631348, "learning_rate": 2.6050878830546282e-05, "loss": 3.5252, "step": 37300 }, { "epoch": 1.5975396181282302, "grad_norm": 10.761728286743164, "learning_rate": 2.5971776171114874e-05, "loss": 3.6447, "step": 37400 }, { "epoch": 1.601811114433386, "grad_norm": 7.978828430175781, "learning_rate": 2.5892673511683462e-05, "loss": 3.6211, "step": 37500 }, { "epoch": 1.6060826107385417, "grad_norm": 8.314446449279785, "learning_rate": 2.5813570852252057e-05, "loss": 3.6197, "step": 37600 }, { "epoch": 1.6103541070436975, "grad_norm": 7.391338348388672, "learning_rate": 2.573446819282064e-05, "loss": 3.6123, "step": 37700 }, { "epoch": 1.614625603348853, "grad_norm": 9.402429580688477, "learning_rate": 2.5655365533389236e-05, "loss": 3.7008, "step": 37800 }, { "epoch": 1.6188970996540089, "grad_norm": 9.703052520751953, "learning_rate": 2.5576262873957825e-05, "loss": 3.6748, "step": 37900 }, { "epoch": 1.6231685959591644, "grad_norm": 7.890733242034912, "learning_rate": 2.5497160214526413e-05, "loss": 3.6766, "step": 38000 }, { "epoch": 1.6231685959591644, "eval_runtime": 403.3234, "eval_samples_per_second": 116.09, "eval_steps_per_second": 14.512, "step": 38000 }, { "epoch": 1.6274400922643202, "grad_norm": 7.096985816955566, "learning_rate": 2.5418057555095004e-05, "loss": 3.6308, "step": 38100 }, { "epoch": 1.631711588569476, "grad_norm": 10.07547378540039, "learning_rate": 2.5338954895663592e-05, "loss": 3.5905, "step": 38200 }, { "epoch": 1.6359830848746317, "grad_norm": 8.68416690826416, "learning_rate": 2.5259852236232184e-05, "loss": 3.5352, "step": 38300 }, { "epoch": 1.6402545811797873, "grad_norm": 10.171316146850586, "learning_rate": 2.5180749576800772e-05, "loss": 3.6892, "step": 38400 }, { "epoch": 1.6445260774849428, "grad_norm": 12.549208641052246, "learning_rate": 2.5101646917369363e-05, "loss": 3.6036, "step": 38500 }, { "epoch": 1.6487975737900986, "grad_norm": 9.339801788330078, "learning_rate": 2.502254425793795e-05, "loss": 3.5937, "step": 38600 }, { "epoch": 1.6530690700952544, "grad_norm": 7.933904647827148, "learning_rate": 2.4943441598506543e-05, "loss": 3.588, "step": 38700 }, { "epoch": 1.6573405664004102, "grad_norm": 11.310770988464355, "learning_rate": 2.486433893907513e-05, "loss": 3.7504, "step": 38800 }, { "epoch": 1.6616120627055657, "grad_norm": 9.128674507141113, "learning_rate": 2.4785236279643722e-05, "loss": 3.65, "step": 38900 }, { "epoch": 1.6658835590107215, "grad_norm": 10.278397560119629, "learning_rate": 2.4706133620212314e-05, "loss": 3.7471, "step": 39000 }, { "epoch": 1.6658835590107215, "eval_runtime": 403.4984, "eval_samples_per_second": 116.04, "eval_steps_per_second": 14.506, "step": 39000 }, { "epoch": 1.670155055315877, "grad_norm": 9.706366539001465, "learning_rate": 2.4627030960780902e-05, "loss": 3.6192, "step": 39100 }, { "epoch": 1.6744265516210328, "grad_norm": 8.632245063781738, "learning_rate": 2.4547928301349494e-05, "loss": 3.5752, "step": 39200 }, { "epoch": 1.6786980479261886, "grad_norm": 8.402227401733398, "learning_rate": 2.4468825641918085e-05, "loss": 3.6774, "step": 39300 }, { "epoch": 1.6829695442313444, "grad_norm": 8.275464057922363, "learning_rate": 2.4389722982486673e-05, "loss": 3.6148, "step": 39400 }, { "epoch": 1.6872410405365, "grad_norm": 9.17482852935791, "learning_rate": 2.431062032305526e-05, "loss": 3.6413, "step": 39500 }, { "epoch": 1.6915125368416555, "grad_norm": 8.914527893066406, "learning_rate": 2.4231517663623853e-05, "loss": 3.7191, "step": 39600 }, { "epoch": 1.6957840331468113, "grad_norm": 8.066243171691895, "learning_rate": 2.415241500419244e-05, "loss": 3.5606, "step": 39700 }, { "epoch": 1.700055529451967, "grad_norm": 9.488569259643555, "learning_rate": 2.4073312344761032e-05, "loss": 3.6873, "step": 39800 }, { "epoch": 1.7043270257571228, "grad_norm": 9.717203140258789, "learning_rate": 2.399420968532962e-05, "loss": 3.688, "step": 39900 }, { "epoch": 1.7085985220622786, "grad_norm": 8.048073768615723, "learning_rate": 2.3915107025898212e-05, "loss": 3.515, "step": 40000 }, { "epoch": 1.7085985220622786, "eval_runtime": 404.1323, "eval_samples_per_second": 115.858, "eval_steps_per_second": 14.483, "step": 40000 }, { "epoch": 1.7128700183674341, "grad_norm": 9.101920127868652, "learning_rate": 2.3836004366466803e-05, "loss": 3.6674, "step": 40100 }, { "epoch": 1.7171415146725897, "grad_norm": 6.701783180236816, "learning_rate": 2.375690170703539e-05, "loss": 3.6339, "step": 40200 }, { "epoch": 1.7214130109777455, "grad_norm": 9.65266227722168, "learning_rate": 2.3677799047603983e-05, "loss": 3.5557, "step": 40300 }, { "epoch": 1.7256845072829012, "grad_norm": 9.488314628601074, "learning_rate": 2.359869638817257e-05, "loss": 3.7312, "step": 40400 }, { "epoch": 1.729956003588057, "grad_norm": 8.73523235321045, "learning_rate": 2.3519593728741163e-05, "loss": 3.6714, "step": 40500 }, { "epoch": 1.7342274998932126, "grad_norm": 9.438526153564453, "learning_rate": 2.344049106930975e-05, "loss": 3.6664, "step": 40600 }, { "epoch": 1.7384989961983683, "grad_norm": 9.409259796142578, "learning_rate": 2.336138840987834e-05, "loss": 3.5598, "step": 40700 }, { "epoch": 1.742770492503524, "grad_norm": 6.831430435180664, "learning_rate": 2.328228575044693e-05, "loss": 3.7215, "step": 40800 }, { "epoch": 1.7470419888086797, "grad_norm": 8.387484550476074, "learning_rate": 2.320318309101552e-05, "loss": 3.578, "step": 40900 }, { "epoch": 1.7513134851138354, "grad_norm": 9.247336387634277, "learning_rate": 2.312408043158411e-05, "loss": 3.615, "step": 41000 }, { "epoch": 1.7513134851138354, "eval_runtime": 404.0825, "eval_samples_per_second": 115.872, "eval_steps_per_second": 14.485, "step": 41000 }, { "epoch": 1.7555849814189912, "grad_norm": 11.280122756958008, "learning_rate": 2.30449777721527e-05, "loss": 3.5713, "step": 41100 }, { "epoch": 1.7598564777241468, "grad_norm": 8.902118682861328, "learning_rate": 2.2965875112721293e-05, "loss": 3.6999, "step": 41200 }, { "epoch": 1.7641279740293023, "grad_norm": 11.0384521484375, "learning_rate": 2.288677245328988e-05, "loss": 3.7134, "step": 41300 }, { "epoch": 1.768399470334458, "grad_norm": 8.986517906188965, "learning_rate": 2.2807669793858472e-05, "loss": 3.5391, "step": 41400 }, { "epoch": 1.7726709666396139, "grad_norm": 9.237929344177246, "learning_rate": 2.272856713442706e-05, "loss": 3.781, "step": 41500 }, { "epoch": 1.7769424629447697, "grad_norm": 12.143738746643066, "learning_rate": 2.264946447499565e-05, "loss": 3.5613, "step": 41600 }, { "epoch": 1.7812139592499252, "grad_norm": 9.296298027038574, "learning_rate": 2.257036181556424e-05, "loss": 3.6645, "step": 41700 }, { "epoch": 1.785485455555081, "grad_norm": 9.721207618713379, "learning_rate": 2.2491259156132828e-05, "loss": 3.5764, "step": 41800 }, { "epoch": 1.7897569518602365, "grad_norm": 11.145936012268066, "learning_rate": 2.241215649670142e-05, "loss": 3.5321, "step": 41900 }, { "epoch": 1.7940284481653923, "grad_norm": 10.27043628692627, "learning_rate": 2.233305383727001e-05, "loss": 3.6625, "step": 42000 }, { "epoch": 1.7940284481653923, "eval_runtime": 405.1907, "eval_samples_per_second": 115.555, "eval_steps_per_second": 14.445, "step": 42000 }, { "epoch": 1.798299944470548, "grad_norm": 10.281463623046875, "learning_rate": 2.22539511778386e-05, "loss": 3.5566, "step": 42100 }, { "epoch": 1.8025714407757039, "grad_norm": 6.728999614715576, "learning_rate": 2.217484851840719e-05, "loss": 3.5608, "step": 42200 }, { "epoch": 1.8068429370808594, "grad_norm": 6.053191184997559, "learning_rate": 2.209574585897578e-05, "loss": 3.685, "step": 42300 }, { "epoch": 1.811114433386015, "grad_norm": 8.071969032287598, "learning_rate": 2.201664319954437e-05, "loss": 3.6003, "step": 42400 }, { "epoch": 1.8153859296911707, "grad_norm": 29.326370239257812, "learning_rate": 2.1937540540112962e-05, "loss": 3.6615, "step": 42500 }, { "epoch": 1.8196574259963265, "grad_norm": 8.652432441711426, "learning_rate": 2.185843788068155e-05, "loss": 3.5731, "step": 42600 }, { "epoch": 1.8239289223014823, "grad_norm": 11.717292785644531, "learning_rate": 2.1779335221250138e-05, "loss": 3.6371, "step": 42700 }, { "epoch": 1.828200418606638, "grad_norm": 10.365557670593262, "learning_rate": 2.170023256181873e-05, "loss": 3.6857, "step": 42800 }, { "epoch": 1.8324719149117936, "grad_norm": 12.400829315185547, "learning_rate": 2.1621129902387317e-05, "loss": 3.6896, "step": 42900 }, { "epoch": 1.8367434112169492, "grad_norm": 8.40799331665039, "learning_rate": 2.154202724295591e-05, "loss": 3.6611, "step": 43000 }, { "epoch": 1.8367434112169492, "eval_runtime": 403.8313, "eval_samples_per_second": 115.944, "eval_steps_per_second": 14.494, "step": 43000 }, { "epoch": 1.841014907522105, "grad_norm": 10.518604278564453, "learning_rate": 2.14629245835245e-05, "loss": 3.6118, "step": 43100 }, { "epoch": 1.8452864038272607, "grad_norm": 7.877737998962402, "learning_rate": 2.138382192409309e-05, "loss": 3.5943, "step": 43200 }, { "epoch": 1.8495579001324165, "grad_norm": 12.722783088684082, "learning_rate": 2.130471926466168e-05, "loss": 3.6583, "step": 43300 }, { "epoch": 1.853829396437572, "grad_norm": 8.382994651794434, "learning_rate": 2.1225616605230268e-05, "loss": 3.5931, "step": 43400 }, { "epoch": 1.8581008927427278, "grad_norm": 10.603730201721191, "learning_rate": 2.114651394579886e-05, "loss": 3.6558, "step": 43500 }, { "epoch": 1.8623723890478834, "grad_norm": 6.978638172149658, "learning_rate": 2.106741128636745e-05, "loss": 3.504, "step": 43600 }, { "epoch": 1.8666438853530392, "grad_norm": 7.777115345001221, "learning_rate": 2.0988308626936036e-05, "loss": 3.7573, "step": 43700 }, { "epoch": 1.870915381658195, "grad_norm": 8.054482460021973, "learning_rate": 2.0909205967504627e-05, "loss": 3.5624, "step": 43800 }, { "epoch": 1.8751868779633507, "grad_norm": 8.191532135009766, "learning_rate": 2.083010330807322e-05, "loss": 3.6115, "step": 43900 }, { "epoch": 1.8794583742685063, "grad_norm": 9.908390998840332, "learning_rate": 2.0751000648641807e-05, "loss": 3.6564, "step": 44000 }, { "epoch": 1.8794583742685063, "eval_runtime": 403.8168, "eval_samples_per_second": 115.949, "eval_steps_per_second": 14.494, "step": 44000 }, { "epoch": 1.8837298705736618, "grad_norm": 10.703449249267578, "learning_rate": 2.06718979892104e-05, "loss": 3.6265, "step": 44100 }, { "epoch": 1.8880013668788176, "grad_norm": 8.703311920166016, "learning_rate": 2.059279532977899e-05, "loss": 3.601, "step": 44200 }, { "epoch": 1.8922728631839734, "grad_norm": 16.844961166381836, "learning_rate": 2.0513692670347578e-05, "loss": 3.5735, "step": 44300 }, { "epoch": 1.8965443594891291, "grad_norm": 7.944665908813477, "learning_rate": 2.043459001091617e-05, "loss": 3.6514, "step": 44400 }, { "epoch": 1.9008158557942847, "grad_norm": 10.938014030456543, "learning_rate": 2.0355487351484758e-05, "loss": 3.6739, "step": 44500 }, { "epoch": 1.9050873520994405, "grad_norm": 7.884680271148682, "learning_rate": 2.027638469205335e-05, "loss": 3.6705, "step": 44600 }, { "epoch": 1.909358848404596, "grad_norm": 10.993422508239746, "learning_rate": 2.0197282032621937e-05, "loss": 3.5416, "step": 44700 }, { "epoch": 1.9136303447097518, "grad_norm": 9.719098091125488, "learning_rate": 2.0118179373190525e-05, "loss": 3.6548, "step": 44800 }, { "epoch": 1.9179018410149076, "grad_norm": 9.458189964294434, "learning_rate": 2.0039076713759117e-05, "loss": 3.7504, "step": 44900 }, { "epoch": 1.9221733373200633, "grad_norm": 10.599435806274414, "learning_rate": 1.9959974054327708e-05, "loss": 3.5734, "step": 45000 }, { "epoch": 1.9221733373200633, "eval_runtime": 404.0106, "eval_samples_per_second": 115.893, "eval_steps_per_second": 14.487, "step": 45000 }, { "epoch": 1.926444833625219, "grad_norm": 9.23690128326416, "learning_rate": 1.9880871394896296e-05, "loss": 3.7208, "step": 45100 }, { "epoch": 1.9307163299303745, "grad_norm": 7.124606609344482, "learning_rate": 1.9801768735464888e-05, "loss": 3.6351, "step": 45200 }, { "epoch": 1.9349878262355302, "grad_norm": 8.71446704864502, "learning_rate": 1.9722666076033476e-05, "loss": 3.6835, "step": 45300 }, { "epoch": 1.939259322540686, "grad_norm": 9.558823585510254, "learning_rate": 1.9643563416602067e-05, "loss": 3.5569, "step": 45400 }, { "epoch": 1.9435308188458418, "grad_norm": 9.622088432312012, "learning_rate": 1.956446075717066e-05, "loss": 3.6797, "step": 45500 }, { "epoch": 1.9478023151509976, "grad_norm": 8.641619682312012, "learning_rate": 1.9485358097739247e-05, "loss": 3.6377, "step": 45600 }, { "epoch": 1.952073811456153, "grad_norm": 12.308704376220703, "learning_rate": 1.940625543830784e-05, "loss": 3.5211, "step": 45700 }, { "epoch": 1.9563453077613087, "grad_norm": 8.850275993347168, "learning_rate": 1.9327152778876426e-05, "loss": 3.5652, "step": 45800 }, { "epoch": 1.9606168040664644, "grad_norm": 8.595603942871094, "learning_rate": 1.9248050119445015e-05, "loss": 3.6181, "step": 45900 }, { "epoch": 1.9648883003716202, "grad_norm": 8.737709999084473, "learning_rate": 1.9168947460013606e-05, "loss": 3.6392, "step": 46000 }, { "epoch": 1.9648883003716202, "eval_runtime": 403.5624, "eval_samples_per_second": 116.022, "eval_steps_per_second": 14.503, "step": 46000 }, { "epoch": 1.969159796676776, "grad_norm": 10.178166389465332, "learning_rate": 1.9089844800582198e-05, "loss": 3.5406, "step": 46100 }, { "epoch": 1.9734312929819315, "grad_norm": 8.49496841430664, "learning_rate": 1.9010742141150786e-05, "loss": 3.5631, "step": 46200 }, { "epoch": 1.9777027892870873, "grad_norm": 12.1917724609375, "learning_rate": 1.8931639481719377e-05, "loss": 3.6878, "step": 46300 }, { "epoch": 1.9819742855922429, "grad_norm": 7.169999599456787, "learning_rate": 1.8852536822287965e-05, "loss": 3.5653, "step": 46400 }, { "epoch": 1.9862457818973986, "grad_norm": 9.828686714172363, "learning_rate": 1.8773434162856557e-05, "loss": 3.5959, "step": 46500 }, { "epoch": 1.9905172782025544, "grad_norm": 11.669685363769531, "learning_rate": 1.8694331503425148e-05, "loss": 3.7558, "step": 46600 }, { "epoch": 1.9947887745077102, "grad_norm": 9.722572326660156, "learning_rate": 1.8615228843993736e-05, "loss": 3.5793, "step": 46700 }, { "epoch": 1.9990602708128657, "grad_norm": 7.060891151428223, "learning_rate": 1.8536126184562324e-05, "loss": 3.5613, "step": 46800 }, { "epoch": 2.0033317671180213, "grad_norm": 7.597713470458984, "learning_rate": 1.8457023525130916e-05, "loss": 3.527, "step": 46900 }, { "epoch": 2.007603263423177, "grad_norm": 8.622049331665039, "learning_rate": 1.8377920865699504e-05, "loss": 3.576, "step": 47000 }, { "epoch": 2.007603263423177, "eval_runtime": 404.752, "eval_samples_per_second": 115.681, "eval_steps_per_second": 14.461, "step": 47000 }, { "epoch": 2.011874759728333, "grad_norm": 7.117955207824707, "learning_rate": 1.8298818206268095e-05, "loss": 3.6644, "step": 47100 }, { "epoch": 2.0161462560334886, "grad_norm": 7.748778820037842, "learning_rate": 1.8219715546836687e-05, "loss": 3.5904, "step": 47200 }, { "epoch": 2.0204177523386444, "grad_norm": 7.402785301208496, "learning_rate": 1.8140612887405275e-05, "loss": 3.6069, "step": 47300 }, { "epoch": 2.0246892486437997, "grad_norm": 7.453569412231445, "learning_rate": 1.8061510227973867e-05, "loss": 3.6795, "step": 47400 }, { "epoch": 2.0289607449489555, "grad_norm": 8.299507141113281, "learning_rate": 1.7982407568542455e-05, "loss": 3.6194, "step": 47500 }, { "epoch": 2.0332322412541113, "grad_norm": 10.050152778625488, "learning_rate": 1.7903304909111046e-05, "loss": 3.5512, "step": 47600 }, { "epoch": 2.037503737559267, "grad_norm": 8.691873550415039, "learning_rate": 1.7824202249679638e-05, "loss": 3.6216, "step": 47700 }, { "epoch": 2.041775233864423, "grad_norm": 7.912090301513672, "learning_rate": 1.7745099590248222e-05, "loss": 3.5601, "step": 47800 }, { "epoch": 2.0460467301695786, "grad_norm": 9.80728530883789, "learning_rate": 1.7665996930816814e-05, "loss": 3.6074, "step": 47900 }, { "epoch": 2.050318226474734, "grad_norm": 11.86419677734375, "learning_rate": 1.7586894271385405e-05, "loss": 3.5964, "step": 48000 }, { "epoch": 2.050318226474734, "eval_runtime": 403.5223, "eval_samples_per_second": 116.033, "eval_steps_per_second": 14.505, "step": 48000 }, { "epoch": 2.0545897227798897, "grad_norm": 8.644769668579102, "learning_rate": 1.7507791611953993e-05, "loss": 3.53, "step": 48100 }, { "epoch": 2.0588612190850455, "grad_norm": 8.596597671508789, "learning_rate": 1.7428688952522585e-05, "loss": 3.6423, "step": 48200 }, { "epoch": 2.0631327153902013, "grad_norm": 8.68507194519043, "learning_rate": 1.7349586293091173e-05, "loss": 3.5187, "step": 48300 }, { "epoch": 2.067404211695357, "grad_norm": 12.417092323303223, "learning_rate": 1.7270483633659764e-05, "loss": 3.6873, "step": 48400 }, { "epoch": 2.0716757080005124, "grad_norm": 7.873465061187744, "learning_rate": 1.7191380974228356e-05, "loss": 3.556, "step": 48500 }, { "epoch": 2.075947204305668, "grad_norm": 9.485852241516113, "learning_rate": 1.7112278314796944e-05, "loss": 3.5671, "step": 48600 }, { "epoch": 2.080218700610824, "grad_norm": 9.282876968383789, "learning_rate": 1.7033175655365536e-05, "loss": 3.5649, "step": 48700 }, { "epoch": 2.0844901969159797, "grad_norm": 9.663043022155762, "learning_rate": 1.6954072995934127e-05, "loss": 3.5675, "step": 48800 }, { "epoch": 2.0887616932211355, "grad_norm": 9.47641372680664, "learning_rate": 1.6874970336502712e-05, "loss": 3.5404, "step": 48900 }, { "epoch": 2.0930331895262912, "grad_norm": 9.768278121948242, "learning_rate": 1.6795867677071303e-05, "loss": 3.6144, "step": 49000 }, { "epoch": 2.0930331895262912, "eval_runtime": 403.9425, "eval_samples_per_second": 115.913, "eval_steps_per_second": 14.49, "step": 49000 }, { "epoch": 2.0973046858314466, "grad_norm": 9.314282417297363, "learning_rate": 1.6716765017639895e-05, "loss": 3.6568, "step": 49100 }, { "epoch": 2.1015761821366024, "grad_norm": 8.707430839538574, "learning_rate": 1.6637662358208483e-05, "loss": 3.5775, "step": 49200 }, { "epoch": 2.105847678441758, "grad_norm": 11.704259872436523, "learning_rate": 1.6558559698777074e-05, "loss": 3.5568, "step": 49300 }, { "epoch": 2.110119174746914, "grad_norm": 8.504453659057617, "learning_rate": 1.6479457039345662e-05, "loss": 3.6528, "step": 49400 }, { "epoch": 2.1143906710520697, "grad_norm": 8.935593605041504, "learning_rate": 1.6400354379914254e-05, "loss": 3.7016, "step": 49500 }, { "epoch": 2.1186621673572255, "grad_norm": 8.349204063415527, "learning_rate": 1.6321251720482845e-05, "loss": 3.5431, "step": 49600 }, { "epoch": 2.122933663662381, "grad_norm": 11.8608980178833, "learning_rate": 1.6242149061051433e-05, "loss": 3.5844, "step": 49700 }, { "epoch": 2.1272051599675366, "grad_norm": 7.555705547332764, "learning_rate": 1.6163046401620025e-05, "loss": 3.5815, "step": 49800 }, { "epoch": 2.1314766562726923, "grad_norm": 9.529816627502441, "learning_rate": 1.6083943742188613e-05, "loss": 3.5485, "step": 49900 }, { "epoch": 2.135748152577848, "grad_norm": 9.32353401184082, "learning_rate": 1.60048410827572e-05, "loss": 3.589, "step": 50000 }, { "epoch": 2.135748152577848, "eval_runtime": 404.1831, "eval_samples_per_second": 115.844, "eval_steps_per_second": 14.481, "step": 50000 }, { "epoch": 2.140019648883004, "grad_norm": 8.285137176513672, "learning_rate": 1.5925738423325793e-05, "loss": 3.6874, "step": 50100 }, { "epoch": 2.144291145188159, "grad_norm": 6.94751501083374, "learning_rate": 1.5846635763894384e-05, "loss": 3.6489, "step": 50200 }, { "epoch": 2.148562641493315, "grad_norm": 11.093490600585938, "learning_rate": 1.5767533104462972e-05, "loss": 3.6675, "step": 50300 }, { "epoch": 2.1528341377984708, "grad_norm": 8.154306411743164, "learning_rate": 1.5688430445031564e-05, "loss": 3.5934, "step": 50400 }, { "epoch": 2.1571056341036265, "grad_norm": 8.806336402893066, "learning_rate": 1.5609327785600152e-05, "loss": 3.5804, "step": 50500 }, { "epoch": 2.1613771304087823, "grad_norm": 10.496975898742676, "learning_rate": 1.5530225126168743e-05, "loss": 3.6913, "step": 50600 }, { "epoch": 2.165648626713938, "grad_norm": 9.081565856933594, "learning_rate": 1.5451122466737335e-05, "loss": 3.7297, "step": 50700 }, { "epoch": 2.1699201230190934, "grad_norm": 7.850902557373047, "learning_rate": 1.5372019807305923e-05, "loss": 3.7112, "step": 50800 }, { "epoch": 2.174191619324249, "grad_norm": 8.145720481872559, "learning_rate": 1.529291714787451e-05, "loss": 3.6324, "step": 50900 }, { "epoch": 2.178463115629405, "grad_norm": 8.924689292907715, "learning_rate": 1.52138144884431e-05, "loss": 3.6598, "step": 51000 }, { "epoch": 2.178463115629405, "eval_runtime": 404.5353, "eval_samples_per_second": 115.743, "eval_steps_per_second": 14.468, "step": 51000 }, { "epoch": 2.1827346119345608, "grad_norm": 13.303974151611328, "learning_rate": 1.513471182901169e-05, "loss": 3.5284, "step": 51100 }, { "epoch": 2.1870061082397165, "grad_norm": 8.976105690002441, "learning_rate": 1.5055609169580282e-05, "loss": 3.6514, "step": 51200 }, { "epoch": 2.1912776045448723, "grad_norm": 7.439825057983398, "learning_rate": 1.4976506510148872e-05, "loss": 3.5687, "step": 51300 }, { "epoch": 2.1955491008500276, "grad_norm": 8.54857349395752, "learning_rate": 1.4897403850717462e-05, "loss": 3.7166, "step": 51400 }, { "epoch": 2.1998205971551834, "grad_norm": 11.23593521118164, "learning_rate": 1.4818301191286051e-05, "loss": 3.5591, "step": 51500 }, { "epoch": 2.204092093460339, "grad_norm": 13.313474655151367, "learning_rate": 1.4739198531854643e-05, "loss": 3.7213, "step": 51600 }, { "epoch": 2.208363589765495, "grad_norm": 9.998103141784668, "learning_rate": 1.4660095872423233e-05, "loss": 3.5843, "step": 51700 }, { "epoch": 2.2126350860706507, "grad_norm": 8.799863815307617, "learning_rate": 1.4580993212991822e-05, "loss": 3.7109, "step": 51800 }, { "epoch": 2.216906582375806, "grad_norm": 7.352701187133789, "learning_rate": 1.4501890553560412e-05, "loss": 3.6722, "step": 51900 }, { "epoch": 2.221178078680962, "grad_norm": 12.166138648986816, "learning_rate": 1.4422787894129e-05, "loss": 3.582, "step": 52000 }, { "epoch": 2.221178078680962, "eval_runtime": 403.5081, "eval_samples_per_second": 116.037, "eval_steps_per_second": 14.505, "step": 52000 }, { "epoch": 2.2254495749861176, "grad_norm": 10.341227531433105, "learning_rate": 1.434368523469759e-05, "loss": 3.7105, "step": 52100 }, { "epoch": 2.2297210712912734, "grad_norm": 7.697736740112305, "learning_rate": 1.426458257526618e-05, "loss": 3.5896, "step": 52200 }, { "epoch": 2.233992567596429, "grad_norm": 7.957235336303711, "learning_rate": 1.4185479915834771e-05, "loss": 3.5472, "step": 52300 }, { "epoch": 2.2382640639015845, "grad_norm": 7.778316020965576, "learning_rate": 1.4106377256403361e-05, "loss": 3.5998, "step": 52400 }, { "epoch": 2.2425355602067403, "grad_norm": 8.099467277526855, "learning_rate": 1.4027274596971951e-05, "loss": 3.7143, "step": 52500 }, { "epoch": 2.246807056511896, "grad_norm": 8.077199935913086, "learning_rate": 1.394817193754054e-05, "loss": 3.6727, "step": 52600 }, { "epoch": 2.251078552817052, "grad_norm": 10.278371810913086, "learning_rate": 1.3869069278109132e-05, "loss": 3.638, "step": 52700 }, { "epoch": 2.2553500491222076, "grad_norm": 10.49933910369873, "learning_rate": 1.3789966618677722e-05, "loss": 3.5718, "step": 52800 }, { "epoch": 2.2596215454273634, "grad_norm": 10.37414264678955, "learning_rate": 1.3710863959246312e-05, "loss": 3.551, "step": 52900 }, { "epoch": 2.2638930417325187, "grad_norm": 6.969189643859863, "learning_rate": 1.36317612998149e-05, "loss": 3.593, "step": 53000 }, { "epoch": 2.2638930417325187, "eval_runtime": 404.0848, "eval_samples_per_second": 115.872, "eval_steps_per_second": 14.485, "step": 53000 }, { "epoch": 2.2681645380376745, "grad_norm": 7.354485511779785, "learning_rate": 1.355265864038349e-05, "loss": 3.5927, "step": 53100 }, { "epoch": 2.2724360343428303, "grad_norm": 10.107403755187988, "learning_rate": 1.347355598095208e-05, "loss": 3.6626, "step": 53200 }, { "epoch": 2.276707530647986, "grad_norm": 9.613969802856445, "learning_rate": 1.339445332152067e-05, "loss": 3.6073, "step": 53300 }, { "epoch": 2.280979026953142, "grad_norm": 7.995043754577637, "learning_rate": 1.3315350662089259e-05, "loss": 3.5675, "step": 53400 }, { "epoch": 2.2852505232582976, "grad_norm": 7.049370765686035, "learning_rate": 1.323624800265785e-05, "loss": 3.6559, "step": 53500 }, { "epoch": 2.289522019563453, "grad_norm": 10.962531089782715, "learning_rate": 1.315714534322644e-05, "loss": 3.667, "step": 53600 }, { "epoch": 2.2937935158686087, "grad_norm": 8.100302696228027, "learning_rate": 1.307804268379503e-05, "loss": 3.6044, "step": 53700 }, { "epoch": 2.2980650121737645, "grad_norm": 8.079455375671387, "learning_rate": 1.299894002436362e-05, "loss": 3.7206, "step": 53800 }, { "epoch": 2.3023365084789202, "grad_norm": 8.500101089477539, "learning_rate": 1.2919837364932211e-05, "loss": 3.6075, "step": 53900 }, { "epoch": 2.306608004784076, "grad_norm": 12.927189826965332, "learning_rate": 1.2840734705500801e-05, "loss": 3.654, "step": 54000 }, { "epoch": 2.306608004784076, "eval_runtime": 403.7677, "eval_samples_per_second": 115.963, "eval_steps_per_second": 14.496, "step": 54000 }, { "epoch": 2.3108795010892313, "grad_norm": 7.132537841796875, "learning_rate": 1.2761632046069388e-05, "loss": 3.6083, "step": 54100 }, { "epoch": 2.315150997394387, "grad_norm": 8.612030982971191, "learning_rate": 1.2682529386637979e-05, "loss": 3.5848, "step": 54200 }, { "epoch": 2.319422493699543, "grad_norm": 8.210352897644043, "learning_rate": 1.2603426727206569e-05, "loss": 3.7611, "step": 54300 }, { "epoch": 2.3236939900046987, "grad_norm": 9.12792682647705, "learning_rate": 1.2524324067775159e-05, "loss": 3.518, "step": 54400 }, { "epoch": 2.3279654863098544, "grad_norm": 9.999760627746582, "learning_rate": 1.2445221408343748e-05, "loss": 3.5513, "step": 54500 }, { "epoch": 2.3322369826150102, "grad_norm": 12.578611373901367, "learning_rate": 1.236611874891234e-05, "loss": 3.5614, "step": 54600 }, { "epoch": 2.3365084789201656, "grad_norm": 12.089159965515137, "learning_rate": 1.228701608948093e-05, "loss": 3.5944, "step": 54700 }, { "epoch": 2.3407799752253213, "grad_norm": 7.965277671813965, "learning_rate": 1.2207913430049518e-05, "loss": 3.4852, "step": 54800 }, { "epoch": 2.345051471530477, "grad_norm": 10.866728782653809, "learning_rate": 1.2128810770618108e-05, "loss": 3.5431, "step": 54900 }, { "epoch": 2.349322967835633, "grad_norm": 10.489164352416992, "learning_rate": 1.2049708111186699e-05, "loss": 3.4309, "step": 55000 }, { "epoch": 2.349322967835633, "eval_runtime": 404.0413, "eval_samples_per_second": 115.884, "eval_steps_per_second": 14.486, "step": 55000 }, { "epoch": 2.3535944641407887, "grad_norm": 8.494600296020508, "learning_rate": 1.1970605451755289e-05, "loss": 3.4994, "step": 55100 }, { "epoch": 2.3578659604459444, "grad_norm": 9.755086898803711, "learning_rate": 1.1891502792323879e-05, "loss": 3.5679, "step": 55200 }, { "epoch": 2.3621374567510998, "grad_norm": 9.621931076049805, "learning_rate": 1.1812400132892468e-05, "loss": 3.6264, "step": 55300 }, { "epoch": 2.3664089530562555, "grad_norm": 12.946763038635254, "learning_rate": 1.1733297473461058e-05, "loss": 3.5741, "step": 55400 }, { "epoch": 2.3706804493614113, "grad_norm": 8.984336853027344, "learning_rate": 1.1654194814029648e-05, "loss": 3.5317, "step": 55500 }, { "epoch": 2.374951945666567, "grad_norm": 9.304176330566406, "learning_rate": 1.1575092154598238e-05, "loss": 3.6013, "step": 55600 }, { "epoch": 2.379223441971723, "grad_norm": 8.324792861938477, "learning_rate": 1.1495989495166828e-05, "loss": 3.5887, "step": 55700 }, { "epoch": 2.383494938276878, "grad_norm": 11.814824104309082, "learning_rate": 1.1416886835735419e-05, "loss": 3.4865, "step": 55800 }, { "epoch": 2.387766434582034, "grad_norm": 9.219450950622559, "learning_rate": 1.1337784176304007e-05, "loss": 3.5598, "step": 55900 }, { "epoch": 2.3920379308871897, "grad_norm": 10.202199935913086, "learning_rate": 1.1258681516872597e-05, "loss": 3.6973, "step": 56000 }, { "epoch": 2.3920379308871897, "eval_runtime": 405.1455, "eval_samples_per_second": 115.568, "eval_steps_per_second": 14.447, "step": 56000 }, { "epoch": 2.3963094271923455, "grad_norm": 10.352853775024414, "learning_rate": 1.1179578857441188e-05, "loss": 3.5715, "step": 56100 }, { "epoch": 2.4005809234975013, "grad_norm": 9.794927597045898, "learning_rate": 1.1100476198009778e-05, "loss": 3.5751, "step": 56200 }, { "epoch": 2.4048524198026566, "grad_norm": 9.24485969543457, "learning_rate": 1.1021373538578368e-05, "loss": 3.6939, "step": 56300 }, { "epoch": 2.4091239161078124, "grad_norm": 6.9035162925720215, "learning_rate": 1.0942270879146956e-05, "loss": 3.6613, "step": 56400 }, { "epoch": 2.413395412412968, "grad_norm": 9.021778106689453, "learning_rate": 1.0863168219715548e-05, "loss": 3.5978, "step": 56500 }, { "epoch": 2.417666908718124, "grad_norm": 7.050608158111572, "learning_rate": 1.0784065560284137e-05, "loss": 3.7212, "step": 56600 }, { "epoch": 2.4219384050232797, "grad_norm": 8.771140098571777, "learning_rate": 1.0704962900852727e-05, "loss": 3.6447, "step": 56700 }, { "epoch": 2.4262099013284355, "grad_norm": 14.564820289611816, "learning_rate": 1.0625860241421317e-05, "loss": 3.6091, "step": 56800 }, { "epoch": 2.4304813976335913, "grad_norm": 10.664299011230469, "learning_rate": 1.0546757581989907e-05, "loss": 3.6506, "step": 56900 }, { "epoch": 2.4347528939387466, "grad_norm": 14.445178985595703, "learning_rate": 1.0467654922558497e-05, "loss": 3.6226, "step": 57000 }, { "epoch": 2.4347528939387466, "eval_runtime": 404.9602, "eval_samples_per_second": 115.621, "eval_steps_per_second": 14.453, "step": 57000 }, { "epoch": 2.4390243902439024, "grad_norm": 19.93160057067871, "learning_rate": 1.0388552263127086e-05, "loss": 3.6144, "step": 57100 }, { "epoch": 2.443295886549058, "grad_norm": 7.793177604675293, "learning_rate": 1.0309449603695676e-05, "loss": 3.6577, "step": 57200 }, { "epoch": 2.447567382854214, "grad_norm": 7.95759391784668, "learning_rate": 1.0230346944264268e-05, "loss": 3.6079, "step": 57300 }, { "epoch": 2.4518388791593697, "grad_norm": 10.07507610321045, "learning_rate": 1.0151244284832856e-05, "loss": 3.5974, "step": 57400 }, { "epoch": 2.456110375464525, "grad_norm": 9.73681926727295, "learning_rate": 1.0072141625401446e-05, "loss": 3.5602, "step": 57500 }, { "epoch": 2.460381871769681, "grad_norm": 18.652366638183594, "learning_rate": 9.993038965970037e-06, "loss": 3.5649, "step": 57600 }, { "epoch": 2.4646533680748366, "grad_norm": 10.758431434631348, "learning_rate": 9.913936306538627e-06, "loss": 3.6587, "step": 57700 }, { "epoch": 2.4689248643799924, "grad_norm": 8.963933944702148, "learning_rate": 9.834833647107217e-06, "loss": 3.5872, "step": 57800 }, { "epoch": 2.473196360685148, "grad_norm": 12.521937370300293, "learning_rate": 9.755730987675805e-06, "loss": 3.6379, "step": 57900 }, { "epoch": 2.4774678569903035, "grad_norm": 8.87618350982666, "learning_rate": 9.676628328244396e-06, "loss": 3.6867, "step": 58000 }, { "epoch": 2.4774678569903035, "eval_runtime": 403.9214, "eval_samples_per_second": 115.919, "eval_steps_per_second": 14.49, "step": 58000 }, { "epoch": 2.4817393532954592, "grad_norm": 8.210921287536621, "learning_rate": 9.597525668812986e-06, "loss": 3.5951, "step": 58100 }, { "epoch": 2.486010849600615, "grad_norm": 11.452202796936035, "learning_rate": 9.518423009381576e-06, "loss": 3.573, "step": 58200 }, { "epoch": 2.490282345905771, "grad_norm": 6.497128486633301, "learning_rate": 9.439320349950166e-06, "loss": 3.5676, "step": 58300 }, { "epoch": 2.4945538422109266, "grad_norm": 10.434738159179688, "learning_rate": 9.360217690518757e-06, "loss": 3.6214, "step": 58400 }, { "epoch": 2.4988253385160824, "grad_norm": 10.927915573120117, "learning_rate": 9.281115031087345e-06, "loss": 3.6166, "step": 58500 }, { "epoch": 2.503096834821238, "grad_norm": 9.382610321044922, "learning_rate": 9.202012371655935e-06, "loss": 3.6148, "step": 58600 }, { "epoch": 2.5073683311263935, "grad_norm": 10.243247032165527, "learning_rate": 9.122909712224525e-06, "loss": 3.5586, "step": 58700 }, { "epoch": 2.5116398274315492, "grad_norm": 9.074312210083008, "learning_rate": 9.043807052793116e-06, "loss": 3.5202, "step": 58800 }, { "epoch": 2.515911323736705, "grad_norm": 8.498826026916504, "learning_rate": 8.964704393361706e-06, "loss": 3.6136, "step": 58900 }, { "epoch": 2.520182820041861, "grad_norm": 9.749957084655762, "learning_rate": 8.885601733930294e-06, "loss": 3.6116, "step": 59000 }, { "epoch": 2.520182820041861, "eval_runtime": 403.7151, "eval_samples_per_second": 115.978, "eval_steps_per_second": 14.498, "step": 59000 }, { "epoch": 2.5244543163470166, "grad_norm": 12.452668190002441, "learning_rate": 8.806499074498886e-06, "loss": 3.6322, "step": 59100 }, { "epoch": 2.528725812652172, "grad_norm": 10.466354370117188, "learning_rate": 8.727396415067475e-06, "loss": 3.623, "step": 59200 }, { "epoch": 2.5329973089573277, "grad_norm": 11.655385971069336, "learning_rate": 8.648293755636065e-06, "loss": 3.6222, "step": 59300 }, { "epoch": 2.5372688052624834, "grad_norm": 46.87141799926758, "learning_rate": 8.569191096204655e-06, "loss": 3.5937, "step": 59400 }, { "epoch": 2.541540301567639, "grad_norm": 7.815254211425781, "learning_rate": 8.490088436773245e-06, "loss": 3.6052, "step": 59500 }, { "epoch": 2.545811797872795, "grad_norm": 8.2904052734375, "learning_rate": 8.410985777341835e-06, "loss": 3.668, "step": 59600 }, { "epoch": 2.5500832941779503, "grad_norm": 7.5048017501831055, "learning_rate": 8.331883117910424e-06, "loss": 3.5193, "step": 59700 }, { "epoch": 2.554354790483106, "grad_norm": 8.502148628234863, "learning_rate": 8.252780458479014e-06, "loss": 3.5909, "step": 59800 }, { "epoch": 2.558626286788262, "grad_norm": 7.68582820892334, "learning_rate": 8.173677799047606e-06, "loss": 3.5942, "step": 59900 }, { "epoch": 2.5628977830934176, "grad_norm": 8.871585845947266, "learning_rate": 8.094575139616194e-06, "loss": 3.609, "step": 60000 }, { "epoch": 2.5628977830934176, "eval_runtime": 403.125, "eval_samples_per_second": 116.148, "eval_steps_per_second": 14.519, "step": 60000 }, { "epoch": 2.5671692793985734, "grad_norm": 11.707693099975586, "learning_rate": 8.015472480184783e-06, "loss": 3.559, "step": 60100 }, { "epoch": 2.5714407757037288, "grad_norm": 11.136000633239746, "learning_rate": 7.936369820753373e-06, "loss": 3.6089, "step": 60200 }, { "epoch": 2.575712272008885, "grad_norm": 8.095897674560547, "learning_rate": 7.857267161321965e-06, "loss": 3.5446, "step": 60300 }, { "epoch": 2.5799837683140403, "grad_norm": 9.27779769897461, "learning_rate": 7.778164501890555e-06, "loss": 3.6471, "step": 60400 }, { "epoch": 2.584255264619196, "grad_norm": 10.214181900024414, "learning_rate": 7.699061842459143e-06, "loss": 3.568, "step": 60500 }, { "epoch": 2.588526760924352, "grad_norm": 7.064481258392334, "learning_rate": 7.619959183027733e-06, "loss": 3.6581, "step": 60600 }, { "epoch": 2.5927982572295076, "grad_norm": 10.396333694458008, "learning_rate": 7.540856523596324e-06, "loss": 3.5857, "step": 60700 }, { "epoch": 2.5970697535346634, "grad_norm": 9.091317176818848, "learning_rate": 7.461753864164914e-06, "loss": 3.5121, "step": 60800 }, { "epoch": 2.6013412498398187, "grad_norm": 8.94211483001709, "learning_rate": 7.3826512047335035e-06, "loss": 3.5963, "step": 60900 }, { "epoch": 2.6056127461449745, "grad_norm": 9.317548751831055, "learning_rate": 7.303548545302094e-06, "loss": 3.6764, "step": 61000 }, { "epoch": 2.6056127461449745, "eval_runtime": 403.6369, "eval_samples_per_second": 116.0, "eval_steps_per_second": 14.501, "step": 61000 }, { "epoch": 2.6098842424501303, "grad_norm": 9.0656156539917, "learning_rate": 7.224445885870683e-06, "loss": 3.5781, "step": 61100 }, { "epoch": 2.614155738755286, "grad_norm": 12.859307289123535, "learning_rate": 7.145343226439273e-06, "loss": 3.553, "step": 61200 }, { "epoch": 2.618427235060442, "grad_norm": 10.962692260742188, "learning_rate": 7.0662405670078635e-06, "loss": 3.6146, "step": 61300 }, { "epoch": 2.622698731365597, "grad_norm": 11.84343147277832, "learning_rate": 6.987137907576453e-06, "loss": 3.6113, "step": 61400 }, { "epoch": 2.626970227670753, "grad_norm": 8.770605087280273, "learning_rate": 6.908035248145044e-06, "loss": 3.6307, "step": 61500 }, { "epoch": 2.6312417239759087, "grad_norm": 11.979937553405762, "learning_rate": 6.828932588713632e-06, "loss": 3.5754, "step": 61600 }, { "epoch": 2.6355132202810645, "grad_norm": 8.271350860595703, "learning_rate": 6.749829929282223e-06, "loss": 3.6004, "step": 61700 }, { "epoch": 2.6397847165862203, "grad_norm": 9.494888305664062, "learning_rate": 6.6707272698508125e-06, "loss": 3.5873, "step": 61800 }, { "epoch": 2.6440562128913756, "grad_norm": 8.384838104248047, "learning_rate": 6.591624610419403e-06, "loss": 3.7076, "step": 61900 }, { "epoch": 2.648327709196532, "grad_norm": 11.468506813049316, "learning_rate": 6.512521950987993e-06, "loss": 3.6043, "step": 62000 }, { "epoch": 2.648327709196532, "eval_runtime": 403.9819, "eval_samples_per_second": 115.901, "eval_steps_per_second": 14.488, "step": 62000 }, { "epoch": 2.652599205501687, "grad_norm": 9.485078811645508, "learning_rate": 6.433419291556582e-06, "loss": 3.6547, "step": 62100 }, { "epoch": 2.656870701806843, "grad_norm": 6.771136283874512, "learning_rate": 6.3543166321251725e-06, "loss": 3.6359, "step": 62200 }, { "epoch": 2.6611421981119987, "grad_norm": 7.749585151672363, "learning_rate": 6.275213972693762e-06, "loss": 3.6473, "step": 62300 }, { "epoch": 2.6654136944171545, "grad_norm": 9.156508445739746, "learning_rate": 6.196111313262352e-06, "loss": 3.5664, "step": 62400 }, { "epoch": 2.6696851907223103, "grad_norm": 7.322949409484863, "learning_rate": 6.117008653830942e-06, "loss": 3.6327, "step": 62500 }, { "epoch": 2.6739566870274656, "grad_norm": 9.038566589355469, "learning_rate": 6.0379059943995325e-06, "loss": 3.588, "step": 62600 }, { "epoch": 2.6782281833326214, "grad_norm": 9.974699020385742, "learning_rate": 5.9588033349681214e-06, "loss": 3.5183, "step": 62700 }, { "epoch": 2.682499679637777, "grad_norm": 15.095208168029785, "learning_rate": 5.879700675536712e-06, "loss": 3.6799, "step": 62800 }, { "epoch": 2.686771175942933, "grad_norm": 6.498071670532227, "learning_rate": 5.800598016105302e-06, "loss": 3.665, "step": 62900 }, { "epoch": 2.6910426722480887, "grad_norm": 10.172649383544922, "learning_rate": 5.721495356673892e-06, "loss": 3.5558, "step": 63000 }, { "epoch": 2.6910426722480887, "eval_runtime": 404.4896, "eval_samples_per_second": 115.756, "eval_steps_per_second": 14.47, "step": 63000 }, { "epoch": 2.695314168553244, "grad_norm": 9.67616081237793, "learning_rate": 5.6423926972424814e-06, "loss": 3.6391, "step": 63100 }, { "epoch": 2.6995856648584, "grad_norm": 8.725837707519531, "learning_rate": 5.563290037811072e-06, "loss": 3.5525, "step": 63200 }, { "epoch": 2.7038571611635556, "grad_norm": 7.677910327911377, "learning_rate": 5.484187378379661e-06, "loss": 3.5785, "step": 63300 }, { "epoch": 2.7081286574687113, "grad_norm": 9.097688674926758, "learning_rate": 5.405084718948252e-06, "loss": 3.6759, "step": 63400 }, { "epoch": 2.712400153773867, "grad_norm": 9.70285415649414, "learning_rate": 5.325982059516841e-06, "loss": 3.641, "step": 63500 }, { "epoch": 2.7166716500790224, "grad_norm": 8.540017127990723, "learning_rate": 5.246879400085431e-06, "loss": 3.6462, "step": 63600 }, { "epoch": 2.7209431463841782, "grad_norm": 9.38048267364502, "learning_rate": 5.167776740654021e-06, "loss": 3.6205, "step": 63700 }, { "epoch": 2.725214642689334, "grad_norm": 7.8036417961120605, "learning_rate": 5.088674081222611e-06, "loss": 3.5581, "step": 63800 }, { "epoch": 2.7294861389944898, "grad_norm": 8.558833122253418, "learning_rate": 5.009571421791201e-06, "loss": 3.6245, "step": 63900 }, { "epoch": 2.7337576352996455, "grad_norm": 10.551793098449707, "learning_rate": 4.93046876235979e-06, "loss": 3.6178, "step": 64000 }, { "epoch": 2.7337576352996455, "eval_runtime": 403.4664, "eval_samples_per_second": 116.049, "eval_steps_per_second": 14.507, "step": 64000 }, { "epoch": 2.7380291316048013, "grad_norm": 7.700866222381592, "learning_rate": 4.851366102928381e-06, "loss": 3.5907, "step": 64100 }, { "epoch": 2.742300627909957, "grad_norm": 10.438343048095703, "learning_rate": 4.772263443496971e-06, "loss": 3.6566, "step": 64200 }, { "epoch": 2.7465721242151124, "grad_norm": 10.483076095581055, "learning_rate": 4.693160784065561e-06, "loss": 3.5357, "step": 64300 }, { "epoch": 2.750843620520268, "grad_norm": 10.251741409301758, "learning_rate": 4.61405812463415e-06, "loss": 3.5436, "step": 64400 }, { "epoch": 2.755115116825424, "grad_norm": 7.065600872039795, "learning_rate": 4.534955465202741e-06, "loss": 3.588, "step": 64500 }, { "epoch": 2.7593866131305798, "grad_norm": 6.57476282119751, "learning_rate": 4.45585280577133e-06, "loss": 3.5464, "step": 64600 }, { "epoch": 2.7636581094357355, "grad_norm": 10.847752571105957, "learning_rate": 4.376750146339921e-06, "loss": 3.6241, "step": 64700 }, { "epoch": 2.767929605740891, "grad_norm": 9.701374053955078, "learning_rate": 4.29764748690851e-06, "loss": 3.6638, "step": 64800 }, { "epoch": 2.7722011020460466, "grad_norm": 8.982709884643555, "learning_rate": 4.2185448274771e-06, "loss": 3.5764, "step": 64900 }, { "epoch": 2.7764725983512024, "grad_norm": 11.895380973815918, "learning_rate": 4.13944216804569e-06, "loss": 3.6685, "step": 65000 }, { "epoch": 2.7764725983512024, "eval_runtime": 404.1638, "eval_samples_per_second": 115.849, "eval_steps_per_second": 14.482, "step": 65000 }, { "epoch": 2.780744094656358, "grad_norm": 7.826374053955078, "learning_rate": 4.06033950861428e-06, "loss": 3.6005, "step": 65100 }, { "epoch": 2.785015590961514, "grad_norm": 9.21237564086914, "learning_rate": 3.98123684918287e-06, "loss": 3.5123, "step": 65200 }, { "epoch": 2.7892870872666693, "grad_norm": 9.817154884338379, "learning_rate": 3.902134189751459e-06, "loss": 3.6442, "step": 65300 }, { "epoch": 2.793558583571825, "grad_norm": 9.74837875366211, "learning_rate": 3.823031530320049e-06, "loss": 3.5619, "step": 65400 }, { "epoch": 2.797830079876981, "grad_norm": 8.357489585876465, "learning_rate": 3.74392887088864e-06, "loss": 3.613, "step": 65500 }, { "epoch": 2.8021015761821366, "grad_norm": 10.162979125976562, "learning_rate": 3.664826211457229e-06, "loss": 3.636, "step": 65600 }, { "epoch": 2.8063730724872924, "grad_norm": 9.95310115814209, "learning_rate": 3.5857235520258194e-06, "loss": 3.7321, "step": 65700 }, { "epoch": 2.8106445687924477, "grad_norm": 8.15718936920166, "learning_rate": 3.5066208925944088e-06, "loss": 3.6125, "step": 65800 }, { "epoch": 2.814916065097604, "grad_norm": 11.377549171447754, "learning_rate": 3.427518233162999e-06, "loss": 3.6859, "step": 65900 }, { "epoch": 2.8191875614027593, "grad_norm": 7.227240562438965, "learning_rate": 3.348415573731589e-06, "loss": 3.6915, "step": 66000 }, { "epoch": 2.8191875614027593, "eval_runtime": 403.9368, "eval_samples_per_second": 115.914, "eval_steps_per_second": 14.49, "step": 66000 }, { "epoch": 2.823459057707915, "grad_norm": 9.386198043823242, "learning_rate": 3.2693129143001786e-06, "loss": 3.5728, "step": 66100 }, { "epoch": 2.827730554013071, "grad_norm": 9.223987579345703, "learning_rate": 3.1902102548687688e-06, "loss": 3.6028, "step": 66200 }, { "epoch": 2.8320020503182266, "grad_norm": 7.933398246765137, "learning_rate": 3.1111075954373586e-06, "loss": 3.6051, "step": 66300 }, { "epoch": 2.8362735466233824, "grad_norm": 8.923637390136719, "learning_rate": 3.0320049360059483e-06, "loss": 3.6427, "step": 66400 }, { "epoch": 2.8405450429285377, "grad_norm": 9.507107734680176, "learning_rate": 2.9529022765745386e-06, "loss": 3.6082, "step": 66500 }, { "epoch": 2.8448165392336935, "grad_norm": 8.927079200744629, "learning_rate": 2.8737996171431284e-06, "loss": 3.6545, "step": 66600 }, { "epoch": 2.8490880355388493, "grad_norm": 11.978940963745117, "learning_rate": 2.7946969577117186e-06, "loss": 3.6035, "step": 66700 }, { "epoch": 2.853359531844005, "grad_norm": 8.081381797790527, "learning_rate": 2.7155942982803084e-06, "loss": 3.5326, "step": 66800 }, { "epoch": 2.857631028149161, "grad_norm": 7.906234264373779, "learning_rate": 2.6364916388488986e-06, "loss": 3.5, "step": 66900 }, { "epoch": 2.861902524454316, "grad_norm": 7.646207809448242, "learning_rate": 2.5573889794174884e-06, "loss": 3.6673, "step": 67000 }, { "epoch": 2.861902524454316, "eval_runtime": 403.9147, "eval_samples_per_second": 115.921, "eval_steps_per_second": 14.491, "step": 67000 }, { "epoch": 2.866174020759472, "grad_norm": 7.473504543304443, "learning_rate": 2.478286319986078e-06, "loss": 3.5667, "step": 67100 }, { "epoch": 2.8704455170646277, "grad_norm": 7.414896011352539, "learning_rate": 2.399183660554668e-06, "loss": 3.703, "step": 67200 }, { "epoch": 2.8747170133697835, "grad_norm": 10.893074035644531, "learning_rate": 2.3200810011232577e-06, "loss": 3.5693, "step": 67300 }, { "epoch": 2.8789885096749392, "grad_norm": 9.107102394104004, "learning_rate": 2.240978341691848e-06, "loss": 3.5448, "step": 67400 }, { "epoch": 2.8832600059800946, "grad_norm": 12.374472618103027, "learning_rate": 2.1618756822604377e-06, "loss": 3.5897, "step": 67500 }, { "epoch": 2.887531502285251, "grad_norm": 10.249347686767578, "learning_rate": 2.0827730228290275e-06, "loss": 3.6645, "step": 67600 }, { "epoch": 2.891802998590406, "grad_norm": 10.349568367004395, "learning_rate": 2.0036703633976173e-06, "loss": 3.5886, "step": 67700 }, { "epoch": 2.896074494895562, "grad_norm": 9.08791732788086, "learning_rate": 1.9245677039662075e-06, "loss": 3.6344, "step": 67800 }, { "epoch": 2.9003459912007177, "grad_norm": 23.89297866821289, "learning_rate": 1.8454650445347973e-06, "loss": 3.5352, "step": 67900 }, { "epoch": 2.9046174875058735, "grad_norm": 7.383826732635498, "learning_rate": 1.7663623851033873e-06, "loss": 3.5756, "step": 68000 }, { "epoch": 2.9046174875058735, "eval_runtime": 403.9584, "eval_samples_per_second": 115.908, "eval_steps_per_second": 14.489, "step": 68000 }, { "epoch": 2.9088889838110292, "grad_norm": 10.697755813598633, "learning_rate": 1.6872597256719771e-06, "loss": 3.7499, "step": 68100 }, { "epoch": 2.9131604801161846, "grad_norm": 8.796770095825195, "learning_rate": 1.608157066240567e-06, "loss": 3.6521, "step": 68200 }, { "epoch": 2.9174319764213403, "grad_norm": 9.28873348236084, "learning_rate": 1.529054406809157e-06, "loss": 3.6073, "step": 68300 }, { "epoch": 2.921703472726496, "grad_norm": 9.964879035949707, "learning_rate": 1.449951747377747e-06, "loss": 3.5816, "step": 68400 }, { "epoch": 2.925974969031652, "grad_norm": 8.402993202209473, "learning_rate": 1.3708490879463367e-06, "loss": 3.5752, "step": 68500 }, { "epoch": 2.9302464653368077, "grad_norm": 11.395208358764648, "learning_rate": 1.2917464285149267e-06, "loss": 3.5633, "step": 68600 }, { "epoch": 2.934517961641963, "grad_norm": 8.817408561706543, "learning_rate": 1.2126437690835167e-06, "loss": 3.6463, "step": 68700 }, { "epoch": 2.9387894579471188, "grad_norm": 12.337442398071289, "learning_rate": 1.1335411096521067e-06, "loss": 3.6794, "step": 68800 }, { "epoch": 2.9430609542522745, "grad_norm": 8.89869213104248, "learning_rate": 1.0544384502206965e-06, "loss": 3.4842, "step": 68900 }, { "epoch": 2.9473324505574303, "grad_norm": 8.63214111328125, "learning_rate": 9.753357907892865e-07, "loss": 3.5863, "step": 69000 }, { "epoch": 2.9473324505574303, "eval_runtime": 404.2191, "eval_samples_per_second": 115.833, "eval_steps_per_second": 14.48, "step": 69000 }, { "epoch": 2.951603946862586, "grad_norm": 8.96723747253418, "learning_rate": 8.962331313578763e-07, "loss": 3.6939, "step": 69100 }, { "epoch": 2.9558754431677414, "grad_norm": 9.602298736572266, "learning_rate": 8.171304719264663e-07, "loss": 3.7252, "step": 69200 }, { "epoch": 2.960146939472897, "grad_norm": 12.317747116088867, "learning_rate": 7.380278124950561e-07, "loss": 3.6703, "step": 69300 }, { "epoch": 2.964418435778053, "grad_norm": 7.297631740570068, "learning_rate": 6.589251530636461e-07, "loss": 3.6542, "step": 69400 }, { "epoch": 2.9686899320832087, "grad_norm": 8.46646499633789, "learning_rate": 5.79822493632236e-07, "loss": 3.6304, "step": 69500 }, { "epoch": 2.9729614283883645, "grad_norm": 9.025291442871094, "learning_rate": 5.007198342008259e-07, "loss": 3.5323, "step": 69600 }, { "epoch": 2.9772329246935203, "grad_norm": 8.952362060546875, "learning_rate": 4.216171747694158e-07, "loss": 3.5854, "step": 69700 }, { "epoch": 2.981504420998676, "grad_norm": 6.965066432952881, "learning_rate": 3.4251451533800567e-07, "loss": 3.4377, "step": 69800 }, { "epoch": 2.9857759173038314, "grad_norm": 10.185367584228516, "learning_rate": 2.6341185590659557e-07, "loss": 3.5891, "step": 69900 }, { "epoch": 2.990047413608987, "grad_norm": 11.02718734741211, "learning_rate": 1.8430919647518552e-07, "loss": 3.5265, "step": 70000 }, { "epoch": 2.990047413608987, "eval_runtime": 403.8967, "eval_samples_per_second": 115.926, "eval_steps_per_second": 14.491, "step": 70000 } ], "logging_steps": 100, "max_steps": 70233, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3668072761491424e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }